diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py
--- a/swh/loader/git/loader.py
+++ b/swh/loader/git/loader.py
@@ -44,12 +44,15 @@
     """Repository representation for a Software Heritage origin."""
 
     def __init__(
-        self, storage, base_snapshot: Optional[Snapshot] = None, ignore_history=False
+        self,
+        storage,
+        base_snapshot: Optional[Snapshot] = None,
+        incremental: bool = True,
     ):
         self.storage = storage
-        self.ignore_history = ignore_history
+        self.incremental = incremental
 
-        if base_snapshot and not ignore_history:
+        if base_snapshot and incremental:
             self.base_snapshot: Snapshot = base_snapshot
         else:
             self.base_snapshot = Snapshot(branches={})
@@ -114,7 +117,7 @@
         storage: StorageInterface,
         url: str,
         base_url: Optional[str] = None,
-        ignore_history: bool = False,
+        incremental: bool = True,
         repo_representation: Type[RepoRepresentation] = RepoRepresentation,
         pack_size_bytes: int = 4 * 1024 * 1024 * 1024,
         temp_file_cutoff: int = 100 * 1024 * 1024,
@@ -127,6 +130,10 @@
             repo_representation: swh's repository representation
             which is in charge of filtering between known and remote
             data.
+            ...
+
+            incremental: If True (the default), start from the references of the
+                last known snapshot, if any. Otherwise, load the full repository.
 
         """
         super().__init__(
@@ -136,7 +143,7 @@
         )
         self.origin_url = url
         self.base_url = base_url
-        self.ignore_history = ignore_history
+        self.incremental = incremental
         self.repo_representation = repo_representation
         self.pack_size_bytes = pack_size_bytes
         self.temp_file_cutoff = temp_file_cutoff
@@ -235,7 +242,7 @@
 
         prev_snapshot: Optional[Snapshot] = None
 
-        if not self.ignore_history:
+        if self.incremental:
             prev_snapshot = self.get_full_snapshot(self.origin.url)
 
         if self.base_url and prev_snapshot is None:
@@ -254,7 +261,7 @@
         base_repo = self.repo_representation(
             storage=self.storage,
             base_snapshot=self.base_snapshot,
-            ignore_history=self.ignore_history,
+            incremental=self.incremental,
         )
 
         def do_progress(msg: bytes) -> None:
@@ -508,12 +515,13 @@
         help="Ignore the repository history",
         default=False,
     )
     def main(origin_url: str, base_url: str, ignore_history: bool) -> Dict[str, Any]:
         from swh.storage import get_storage
 
         storage = get_storage(cls="memory")
+        # The CLI option means "ignore history": the negation of ``incremental``.
         loader = GitLoader(
-            storage, origin_url, base_url=base_url, ignore_history=ignore_history,
+            storage, origin_url, base_url=base_url, incremental=not ignore_history,
         )
 
         return loader.load()
diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py
--- a/swh/loader/git/tests/test_from_disk.py
+++ b/swh/loader/git/tests/test_from_disk.py
@@ -151,7 +151,12 @@
         }
 
         res = self.loader.load()
-        assert res == {"status": "uneventful"}
+        # Some loaders have no ``incremental`` flag; they are expected to be
+        # uneventful here, exactly like an incremental loader.
+        incremental = getattr(self.loader, "incremental", True)
+        expected_status = "uneventful" if incremental else "eventful"
+        assert res == {"status": expected_status}
+
         stats1 = get_stats(self.loader.storage)
         expected_stats = copy.deepcopy(stats0)
         expected_stats["origin_visit"] += 1
diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py
--- a/swh/loader/git/tests/test_loader.py
+++ b/swh/loader/git/tests/test_loader.py
@@ -103,8 +103,27 @@
         archive_path = os.path.join(datadir, f"{archive_name}.tgz")
         tmp_path = str(tmp_path)
         self.repo_url = prepare_repository_from_archive(
-            archive_path, archive_name, tmp_path=tmp_path
+            archive_path, archive_name, tmp_path=tmp_path,
+        )
+        self.destination_path = os.path.join(tmp_path, archive_name)
+        self.loader = GitLoader(swh_storage, self.repo_url, incremental=True)
+        self.repo = dulwich.repo.Repo(self.destination_path)
+
+
+class TestGitLoaderNonIncremental(FullGitLoaderTests, CommonGitLoaderNotFound):
+    """Prepare a git directory repository to be loaded through a GitLoader.
+    This tests all git loader scenarios with a non-incremental git loader.
+
+    """
+
+    @pytest.fixture(autouse=True)
+    def init(self, swh_storage, datadir, tmp_path):
+        archive_name = "testrepo"
+        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+        tmp_path = str(tmp_path)
+        self.repo_url = prepare_repository_from_archive(
+            archive_path, archive_name, tmp_path=tmp_path,
         )
         self.destination_path = os.path.join(tmp_path, archive_name)
-        self.loader = GitLoader(swh_storage, self.repo_url)
+        self.loader = GitLoader(swh_storage, self.repo_url, incremental=False)
         self.repo = dulwich.repo.Repo(self.destination_path)