Page MenuHomeSoftware Heritage

D7831.diff
No OneTemporary

D7831.diff

diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py
--- a/swh/loader/git/loader.py
+++ b/swh/loader/git/loader.py
@@ -46,7 +46,7 @@
def __init__(
self,
storage,
- base_snapshot: Optional[Snapshot] = None,
+ base_snapshots: List[Snapshot] = None,
incremental: bool = True,
statsd: Statsd = None,
):
@@ -54,10 +54,10 @@
self.incremental = incremental
self.statsd = statsd
- if base_snapshot and incremental:
- self.base_snapshot: Snapshot = base_snapshot
+ if base_snapshots and incremental:
+ self.base_snapshots: List[Snapshot] = base_snapshots
else:
- self.base_snapshot = Snapshot(branches={})
+ self.base_snapshots = []
self.heads: Set[HexBytes] = set()
@@ -80,10 +80,11 @@
# Cache existing heads
local_heads: Set[HexBytes] = set()
- for branch_name, branch in self.base_snapshot.branches.items():
- if not branch or branch.target_type == TargetType.ALIAS:
- continue
- local_heads.add(hashutil.hash_to_hex(branch.target).encode())
+ for base_snapshot in self.base_snapshots:
+ for branch_name, branch in base_snapshot.branches.items():
+ if not branch or branch.target_type == TargetType.ALIAS:
+ continue
+ local_heads.add(hashutil.hash_to_hex(branch.target).encode())
self.heads = local_heads
@@ -266,48 +267,44 @@
def prepare(self) -> None:
assert self.origin is not None
- prev_snapshot: Optional[Snapshot] = None
+ self.prev_snapshot = Snapshot(branches={})
+ """Last snapshot of this origin if any; empty snapshot otherwise"""
+ self.base_snapshots = []
+ """Last snapshot of this origin and all its parents, if any."""
self.statsd.constant_tags["incremental_enabled"] = self.incremental
self.statsd.constant_tags["has_parent_origins"] = bool(self.parent_origins)
+
+ # Defaults to False; set to True below if any parent origin has a snapshot
+ self.statsd.constant_tags["has_parent_snapshot"] = False
+
if self.incremental:
prev_snapshot = self.get_full_snapshot(self.origin.url)
+ self.statsd.constant_tags["has_previous_snapshot"] = bool(prev_snapshot)
if prev_snapshot:
- incremental_snapshot_origin = "self"
+ self.prev_snapshot = prev_snapshot
+ self.base_snapshots.append(prev_snapshot)
- elif self.parent_origins is not None:
+ if self.parent_origins is not None:
# If this is the first time we load this origin and it is a forge
# fork, load incrementally from one of the origins it was forked from,
# closest parent first
for parent_origin in self.parent_origins:
- prev_snapshot = self.get_full_snapshot(parent_origin.url)
- if prev_snapshot is not None:
- incremental_snapshot_origin = "parent"
- break
- else:
- incremental_snapshot_origin = "none"
- else:
- incremental_snapshot_origin = "none"
-
- self.statsd.constant_tags[
- "incremental_snapshot_origin"
- ] = incremental_snapshot_origin
+ parent_snapshot = self.get_full_snapshot(parent_origin.url)
+ if parent_snapshot is not None:
+ self.statsd.constant_tags["has_parent_snapshot"] = True
+ self.base_snapshots.append(parent_snapshot)
# Increments a metric with full name 'swh_loader_git'; which is useful to
# count how many runs of the loader are with each incremental mode
self.statsd.increment("git_total", tags={})
- if prev_snapshot is not None:
- self.base_snapshot = prev_snapshot
- else:
- self.base_snapshot = Snapshot(branches={})
-
def fetch_data(self) -> bool:
assert self.origin is not None
base_repo = self.repo_representation(
storage=self.storage,
- base_snapshot=self.base_snapshot,
+ base_snapshots=self.base_snapshots,
incremental=self.incremental,
statsd=self.statsd,
)
@@ -502,7 +499,7 @@
unknown_objects = {}
base_snapshot_reverse_branches = {
branch.target: branch
- for branch in self.base_snapshot.branches.values()
+ for branch in self.prev_snapshot.branches.values()
if branch and branch.target_type != TargetType.ALIAS
}
@@ -538,8 +535,8 @@
the one we retrieved at the beginning of the run"""
eventful = False
- if self.base_snapshot and self.snapshot:
- eventful = self.snapshot.id != self.base_snapshot.id
+ if self.prev_snapshot and self.snapshot:
+ eventful = self.snapshot.id != self.prev_snapshot.id
elif self.snapshot:
eventful = bool(self.snapshot.branches)
diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py
--- a/swh/loader/git/tests/test_loader.py
+++ b/swh/loader/git/tests/test_loader.py
@@ -128,7 +128,8 @@
assert self.loader.statsd.constant_tags == {
"visit_type": "git",
"incremental_enabled": True,
- "incremental_snapshot_origin": "none",
+ "has_parent_snapshot": False,
+ "has_previous_snapshot": False,
"has_parent_origins": False,
}
@@ -212,7 +213,8 @@
assert self.loader.statsd.constant_tags == {
"visit_type": "git",
"incremental_enabled": True,
- "incremental_snapshot_origin": "none",
+ "has_parent_snapshot": False,
+ "has_previous_snapshot": False,
"has_parent_origins": True,
}
@@ -284,7 +286,8 @@
assert self.loader.statsd.constant_tags == {
"visit_type": "git",
"incremental_enabled": True,
- "incremental_snapshot_origin": "parent",
+ "has_parent_snapshot": True,
+ "has_previous_snapshot": False,
"has_parent_origins": True,
}
@@ -323,7 +326,14 @@
allowed_statuses=None,
require_snapshot=True,
),
- # -> does not need to fall back to the parent
+ # also fetches the parent, in case the origin was rebased on the parent
+ # since the last visit
+ call(
+ f"base://{self.repo_url}",
+ type=None,
+ allowed_statuses=None,
+ require_snapshot=True,
+ ),
]
# TODO: assert "incremental*" is added to constant tags before these
@@ -336,7 +346,8 @@
assert self.loader.statsd.constant_tags == {
"visit_type": "git",
"incremental_enabled": True,
- "incremental_snapshot_origin": "self",
+ "has_parent_snapshot": False, # Because we reset the mock since last time
+ "has_previous_snapshot": True,
"has_parent_origins": True,
}

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:10 PM (5 d, 15 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217688

Event Timeline