diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 -swh.loader.core >= 3.0.0 +swh.loader.core >= 3.2.0 swh.model >= 4.3.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -115,7 +115,6 @@ self, storage: StorageInterface, url: str, - base_url: Optional[str] = None, incremental: bool = True, repo_representation: Type[RepoRepresentation] = RepoRepresentation, pack_size_bytes: int = 4 * 1024 * 1024 * 1024, @@ -135,7 +134,6 @@ """ super().__init__(storage=storage, origin_url=url, **kwargs) - self.base_url = base_url self.incremental = incremental self.repo_representation = repo_representation self.pack_size_bytes = pack_size_bytes @@ -238,10 +236,14 @@ if self.incremental: prev_snapshot = self.get_full_snapshot(self.origin.url) - if self.base_url and prev_snapshot is None: - base_origin = list(self.storage.origin_get([self.base_url]))[0] - if base_origin: - prev_snapshot = self.get_full_snapshot(base_origin.url) + if self.parent_origins is not None: + # If this is the first time we load this origin and it is a forge + # fork, load incrementally from one of the origins it was forked from, + # closest parent first + for parent_origin in self.parent_origins: + if prev_snapshot is not None: + break + prev_snapshot = self.get_full_snapshot(parent_origin.url) if prev_snapshot is not None: self.base_snapshot = prev_snapshot @@ -509,14 +511,13 @@ help="Ignore the repository history", default=False, ) - def main(origin_url: str, base_url: str, incremental: bool) -> Dict[str, Any]: + def main(origin_url: str, incremental: bool) -> Dict[str, Any]: from swh.storage import get_storage storage = get_storage(cls="memory") loader = GitLoader( storage, origin_url, - base_url=base_url, incremental=incremental, ) return loader.load() diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -9,6 +9,7 @@ import subprocess from tempfile import SpooledTemporaryFile from threading import Thread +from unittest.mock import MagicMock, call from dulwich.errors import GitProtocolError, NotGitRepository, ObjectFormatException from dulwich.porcelain import push @@ -23,6 +24,7 @@ get_stats, prepare_repository_from_archive, ) +from swh.model.model import Origin class CommonGitLoaderNotFound: @@ -111,13 +113,16 @@ class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound): - """Mostly the same loading scenario but with a base-url different than the repo-url. + """Mostly the same loading scenario but with a ``parent_origin`` different from the + ``origin``; as if the ``origin`` was a forge-fork of ``parent_origin``, detected + by the metadata loader. + To walk slightly different paths, the end result should stay the same. """ @pytest.fixture(autouse=True) - def init(self, swh_storage, datadir, tmp_path): + def init(self, swh_storage, datadir, tmp_path, mocker): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) @@ -125,10 +130,83 @@ archive_path, archive_name, tmp_path=tmp_path ) self.destination_path = os.path.join(tmp_path, archive_name) - base_url = f"base://{self.repo_url}" - self.loader = GitLoader(swh_storage, self.repo_url, base_url=base_url) + + self.fetcher = MagicMock() + self.fetcher.get_origin_metadata.return_value = [] + self.fetcher.get_parent_origins.return_value = [ + Origin(url=f"base://{self.repo_url}") + ] + self.fetcher_cls = MagicMock(return_value=self.fetcher) + self.fetcher_cls.SUPPORTED_LISTERS = ["fake-lister"] + mocker.patch( + "swh.loader.core.metadata_fetchers._fetchers", + return_value=[self.fetcher_cls], + ) + + self.loader = GitLoader( + MagicMock(wraps=swh_storage), + self.repo_url, + lister_name="fake-lister", + lister_instance_name="", + ) self.repo = dulwich.repo.Repo(self.destination_path) + def test_load_incremental(self): + res = self.loader.load() + assert res == {"status": "eventful"} + + self.fetcher_cls.assert_called_once_with( + credentials={}, + lister_name="fake-lister", + lister_instance_name="", + origin=Origin(url=self.repo_url), + ) + self.fetcher.get_parent_origins.assert_called_once_with() + + # First tries the same origin + assert self.loader.storage.origin_visit_get_latest.mock_calls == [ + call( + self.repo_url, + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + # As it does not already have a snapshot, fall back to the parent origin + call( + f"base://{self.repo_url}", + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + ] + + self.fetcher.reset_mock() + self.fetcher_cls.reset_mock() + self.loader.storage.reset_mock() + + # Load again + res = self.loader.load() + assert res == {"status": "uneventful"} + + self.fetcher_cls.assert_called_once_with( + credentials={}, + lister_name="fake-lister", + lister_instance_name="", + origin=Origin(url=self.repo_url), + ) + self.fetcher.get_parent_origins.assert_not_called() + + assert self.loader.storage.origin_visit_get_latest.mock_calls == [ + # Tries the same origin, and finds a snapshot + call( + self.repo_url, + allowed_statuses=None, + require_snapshot=True, + type=None, + ), + # -> does not need to fall back to the parent + ] + class DumbGitLoaderTestBase(FullGitLoaderTests): """Prepare a git repository to be loaded using the HTTP dumb transfer protocol."""