diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -8,7 +8,7 @@ import logging from os import path import string -from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union +from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple, Union import attr import iso8601 @@ -84,6 +84,7 @@ artifacts: Sequence[Dict[str, Any]], extid_manifest_format: Optional[str] = None, max_content_size: Optional[int] = None, + snapshot_append: bool = False, ): f"""Loader constructor. @@ -107,6 +108,8 @@ extid_manifest_format: template string used to format a manifest, which is hashed to get the extid of a package. Defaults to {ArchivePackageInfo.MANIFEST_FORMAT!r} + snapshot_append: if :const:`True`, append latest snapshot content to + the new snapshot created by the loader """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) @@ -116,6 +119,7 @@ if extid_manifest_format is None else string.Template(extid_manifest_format) ) + self.snapshot_append = snapshot_append def get_versions(self) -> Sequence[str]: versions = [] @@ -164,3 +168,9 @@ directory=directory, synthetic=True, ) + + def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: + if not self.snapshot_append: + return {} + last_snapshot = self.last_snapshot() + return last_snapshot.to_dict()["branches"] if last_snapshot else {} diff --git a/swh/loader/package/archive/tasks.py b/swh/loader/package/archive/tasks.py --- a/swh/loader/package/archive/tasks.py +++ b/swh/loader/package/archive/tasks.py @@ -9,7 +9,9 @@ @shared_task(name=__name__ + ".LoadArchive") -def load_archive_files(*, url=None, artifacts=None): +def load_archive_files(*, url=None, artifacts=None, snapshot_append=False): """Load archive's artifacts (e.g gnu, etc...)""" - loader = ArchiveLoader.from_configfile(url=url, artifacts=artifacts) + loader = ArchiveLoader.from_configfile( + url=url, artifacts=artifacts, snapshot_append=snapshot_append + ) return loader.load() diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -22,7 +22,14 @@ "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", - } + }, + { + "time": 1480991830, + "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", + "length": 238466, + "filename": "8sync-0.2.0.tar.gz", + "version": "0.2.0", + }, ] _expected_new_contents_first_visit = [ @@ -115,7 +122,7 @@ """With no prior visit, load a gnu project ends up with 1 snapshot """ - loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS) + loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" @@ -173,7 +180,7 @@ """ url = URL - loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS) + loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" @@ -229,13 +236,7 @@ ] assert len(urls) == 1 - artifact2 = { - "time": 1480991830, - "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", - "length": 238466, - "filename": "8sync-0.2.0.tar.gz", - "version": "0.2.0", - } + artifact2 = GNU_ARTIFACTS[1] loader2 = ArchiveLoader(swh_storage, url, [artifact1, artifact2]) stats2 = get_stats(swh_storage) @@ -341,3 +342,77 @@ with pytest.raises(KeyError): p_info.extid(manifest_format=string.Template("$a $unknown_key")) + + +def test_archive_snapshot_append(swh_storage, requests_mock_datadir): + # first loading with a first artifact + artifact1 = GNU_ARTIFACTS[0] + loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") + + # check expected snapshot + snapshot = loader.last_snapshot() + assert len(snapshot.branches) == 2 + branch_artifact1_name = f"releases/{artifact1['version']}".encode() + assert b"HEAD" in snapshot.branches + assert branch_artifact1_name in snapshot.branches + assert snapshot.branches[b"HEAD"].target == branch_artifact1_name + + # second loading with a second artifact + artifact2 = GNU_ARTIFACTS[1] + loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") + + # check expected snapshot, should contain a new branch and the + # branch for the first artifact + snapshot = loader.last_snapshot() + assert len(snapshot.branches) == 3 + branch_artifact2_name = f"releases/{artifact2['version']}".encode() + assert b"HEAD" in snapshot.branches + assert branch_artifact2_name in snapshot.branches + assert branch_artifact1_name in snapshot.branches + assert snapshot.branches[b"HEAD"].target == branch_artifact2_name + + +def test_archive_snapshot_append_branch_override(swh_storage, requests_mock_datadir): + # first loading for a first artifact + artifact1 = GNU_ARTIFACTS[0] + loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") + + # check expected snapshot + snapshot = loader.last_snapshot() + assert len(snapshot.branches) == 2 + branch_artifact1_name = f"releases/{artifact1['version']}".encode() + assert branch_artifact1_name in snapshot.branches + branch_target_first_visit = snapshot.branches[branch_artifact1_name].target + + # second loading for a second artifact with same version as the first one + # but with different tarball content + artifact2 = dict(GNU_ARTIFACTS[0]) + artifact2["url"] = GNU_ARTIFACTS[1]["url"] + artifact2["time"] = GNU_ARTIFACTS[1]["time"] + artifact2["length"] = GNU_ARTIFACTS[1]["length"] + loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") + + # check expected snapshot, should contain the same branch as previously + # but with different target + snapshot = loader.last_snapshot() + assert len(snapshot.branches) == 2 + assert branch_artifact1_name in snapshot.branches + branch_target_second_visit = snapshot.branches[branch_artifact1_name].target + + assert branch_target_first_visit != branch_target_second_visit diff --git a/swh/loader/package/archive/tests/test_tasks.py b/swh/loader/package/archive/tests/test_tasks.py --- a/swh/loader/package/archive/tests/test_tasks.py +++ b/swh/loader/package/archive/tests/test_tasks.py @@ -19,3 +19,20 @@ assert res.successful() assert mock_load.called assert res.result == {"status": "eventful"} + + +def test_tasks_archive_loader_snapshot_append( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.archive.loader.ArchiveLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.archive.tasks.LoadArchive", + kwargs=dict(url="https://gnu.org/", artifacts=[], snapshot_append=True), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"}