D6240.id22696.diff
D6240: Use extids to filter out already seen revisions across hg origins
diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
import os
from shutil import rmtree
from tempfile import mkdtemp
-from typing import Deque, Dict, List, Optional, Set, Tuple, TypeVar, Union
+from typing import Deque, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
@@ -36,7 +36,7 @@
from . import hgutil
from .archive_extract import tmp_extract
-from .hgutil import HgFilteredSet, HgNodeId, HgSpanSet
+from .hgutil import HgNodeId
FLAG_PERMS = {
b"l": DentryPerms.symlink,
@@ -289,6 +289,28 @@
]
return extids
+ def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+ """Get all Mercurial ExtIDs for the mercurial nodes in the list which point to
+ a known revision.
+
+ """
+ extids = [
+ extid
+ for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids)
+ if extid.extid_version == EXTID_VERSION
+ ]
+ if extids:
+ # Filter out dangling extids; their targets need to be loaded again
+ revisions_missing = self.storage.revision_missing(
+ [extid.target.object_id for extid in extids]
+ )
+ extids = [
+ extid
+ for extid in extids
+ if extid.target.object_id not in revisions_missing
+ ]
+ return extids
+
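
For context, a minimal, self-contained sketch of the dangling-ExtID guard that _get_extids_for_hgnodes implements above; it is not part of the patch. The ExtID record and StorageStub below are hypothetical simplifications (in swh.storage the target is a SWHID object, hence extid.target.object_id above), and "hg-nodeid" is the assumed value of EXTID_TYPE:

from dataclasses import dataclass
from typing import Iterable, List, Set

@dataclass
class ExtID:
    extid: bytes          # hg node id
    target: bytes         # id of the swh revision it maps to
    extid_version: int

class StorageStub:
    # Hypothetical stand-in for the swh storage API used above.
    def __init__(self, extids: List[ExtID], present: Set[bytes]):
        self._extids = extids
        self._present = present  # revision ids actually held by the archive

    def extid_get_from_extid(self, extid_type: str, ids: List[bytes]) -> List[ExtID]:
        return [e for e in self._extids if e.extid in ids]

    def revision_missing(self, ids: Iterable[bytes]) -> Set[bytes]:
        return {i for i in ids if i not in self._present}

def extids_for_hgnodes(storage: StorageStub, hgnode_ids: List[bytes],
                       version: int = 1) -> List[ExtID]:
    # Keep mappings of the expected version only...
    extids = [
        e
        for e in storage.extid_get_from_extid("hg-nodeid", hgnode_ids)
        if e.extid_version == version
    ]
    if extids:
        # ...then drop dangling mappings: an ExtID whose target revision is
        # missing from the archive must not hide an hg revision that still
        # needs to be loaded.
        missing = storage.revision_missing([e.target for e in extids])
        extids = [e for e in extids if e.target not in missing]
    return extids

# Example: r2 was lost by the archive, so the ExtID for n2 is dangling
# and gets filtered out.
storage = StorageStub(
    extids=[ExtID(b"n1", b"r1", 1), ExtID(b"n2", b"r2", 1)],
    present={b"r1"},
)
assert extids_for_hgnodes(storage, [b"n1", b"n2"]) == [ExtID(b"n1", b"r1", 1)]
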
def fetch_data(self) -> bool:
"""Fetch the data from the source the loader is currently loading
@@ -317,35 +339,55 @@
return False
- def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]:
- """Return the hg revision numbers to load."""
+ def _new_revs(self, heads: List[bytes]):
+ """Return unseen revisions. That is, filter out revisions that are not ancestors of
+ heads"""
+ assert self._repo is not None
+ existing_heads = []
+
+ for hg_nodeid in heads:
+ try:
+ rev = self._repo[hg_nodeid].rev()
+ existing_heads.append(rev)
+ except KeyError: # the node does not exist anymore
+ pass
+
+ # select revisions that are not ancestors of heads
+ # and not the heads themselves
+ new_revs = self._repo.revs("not ::(%ld)", existing_heads)
+
+ if new_revs:
+ self.log.info("New revisions found: %d", len(new_revs))
+
+ return new_revs
+
+ def get_hg_revs_to_load(self) -> Iterator[int]:
+ """Yield hg revision numbers to load.
+
+ """
assert self._repo is not None
repo: hgutil.Repository = self._repo
+
+ seen_revs: Set[int] = set()
+ # 1. Use the heads of the last snapshot to filter out already seen revisions
if self._latest_heads:
- existing_heads = [] # heads that still exist in the repository
- for hg_nodeid in self._latest_heads:
- try:
- rev = repo[hg_nodeid].rev()
- existing_heads.append(rev)
- except KeyError: # the node does not exist anymore
- pass
-
- # select revisions that are not ancestors of heads
- # and not the heads themselves
- new_revs = repo.revs("not ::(%ld)", existing_heads)
-
- if new_revs:
- self.log.info("New revisions found: %d", len(new_revs))
- return new_revs
- else:
- return repo.revs("all()")
+ for rev in self._new_revs(self._latest_heads):
+ seen_revs.add(rev)
+ yield rev
+
+ # 2. Then filter out the remaining revisions through the ExtID mappings
+ # recorded across all hg origins
+ revs_left = repo.revs("all() - ::(%ld)", seen_revs)
+ hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
+ for rev in self._new_revs(
+ [extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)]
+ ):
+ yield rev
def store_data(self):
"""Store fetched data in the database."""
revs = self.get_hg_revs_to_load()
- if not revs:
- self._load_status = "uneventful"
- return
+ length_ingested_revs = 0
assert self._repo is not None
repo = self._repo
@@ -356,14 +398,15 @@
continue
try:
self.store_revision(repo[rev])
+ length_ingested_revs += 1
except CorruptedRevision as e:
self._visit_status = "partial"
self.log.warning("Corrupted revision %s", e)
descendents = repo.revs("(%ld)::", [rev])
ignored_revs.update(descendents)
- if len(ignored_revs) == len(revs):
- # The repository is completely broken, nothing can be loaded
+ if length_ingested_revs == 0:
+ # Nothing was ingested: the repository is either empty or entirely broken
self._load_status = "uneventful"
return
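
To make the new control flow easier to follow, here is a minimal, self-contained sketch of the two-phase selection that get_hg_revs_to_load now performs; it is not part of the patch. The plain-dict DAG and the ancestors() helper stand in for Mercurial's repo.revs("not ::(%ld)", ...) queries, and the ExtID-derived heads are passed in directly instead of being looked up in storage:

from typing import Dict, Iterator, List, Set

def ancestors(parents: Dict[int, List[int]], heads: List[int]) -> Set[int]:
    # Every revision reachable from the heads, heads included (hg's "::heads").
    reachable: Set[int] = set()
    stack = list(heads)
    while stack:
        rev = stack.pop()
        if rev not in reachable:
            reachable.add(rev)
            stack.extend(parents.get(rev, []))
    return reachable

def revs_to_load(
    parents: Dict[int, List[int]],
    all_revs: List[int],
    snapshot_heads: List[int],
    extid_heads: List[int],
) -> Iterator[int]:
    new: Set[int] = set()
    # 1. If a previous visit of this origin left a snapshot, yield the
    #    revisions that are not reachable from its heads.
    if snapshot_heads:
        seen = ancestors(parents, snapshot_heads)
        for rev in all_revs:
            if rev not in seen:
                new.add(rev)
                yield rev
    # 2. Check what is left against heads already known through ExtIDs,
    #    i.e. ingested under another origin (e.g. a fork of the same
    #    repository): only revisions unreachable from those heads remain.
    known = ancestors(parents, extid_heads)
    for rev in all_revs:
        if rev not in new and rev not in known:
            yield rev

With parents = {1: [0], 2: [1], 3: [2]}, an origin whose last snapshot head is 1 gets revisions 2 and 3 from step 1; a first visit of a fork (no snapshot, but ExtID head 3 recorded by the original origin) yields nothing, which is exactly the "uneventful" outcome the new test below asserts.
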
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -687,3 +687,61 @@
loader = HgLoaderFromDisk(swh_storage, repo_path)
assert loader.load() == {"status": "uneventful"}
+
+
+def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
+ """The first visit of a fork should filter already seen revisions (through extids)
+
+ """
+ archive_name = "the-sandbox"
+ archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+ loader = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+ assert loader.load() == {"status": "eventful"}
+ stats = get_stats(loader.storage)
+ expected_stats = {
+ "content": 2,
+ "directory": 3,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 0,
+ "revision": 58,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+ assert stats == expected_stats
+
+ visit_status = assert_last_visit_matches(
+ loader.storage, repo_url, status="full", type="hg",
+ )
+
+ # Make a fork of the first repository we ingested
+ fork_url = prepare_repository_from_archive(
+ archive_path, "the-sandbox-reloaded", tmp_path
+ )
+ loader2 = HgLoaderFromDisk(
+ swh_storage, url=fork_url, directory=str(tmp_path / archive_name)
+ )
+
+ assert loader2.load() == {"status": "uneventful"}
+
+ stats = get_stats(loader.storage)
+ expected_stats2 = expected_stats.copy()
+ expected_stats2.update(
+ {"origin": 1 + 1, "origin_visit": 1 + 1,}
+ )
+ assert stats == expected_stats2
+
+ visit_status2 = assert_last_visit_matches(
+ loader.storage, fork_url, status="full", type="hg",
+ )
+
+ assert visit_status.snapshot is not None
+ assert visit_status2.snapshot is None
+
+ # FIXME: This behavior is consistent with filtering out data already seen in
+ # the snapshot of a given origin. However, the fork origin then ends up with
+ # no snapshot at all; it probably should get one, otherwise a fork may never
+ # have a snapshot of its own.