Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/tests/test_loader.py
# Copyright (C) 2018-2020 The Software Heritage developers | # Copyright (C) 2018-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import copy | import copy | ||||
import logging | import logging | ||||
import os | import os | ||||
import time | import time | ||||
import hglib | import hglib | ||||
from hglib.error import CommandError | from hglib.error import CommandError | ||||
import pytest | import pytest | ||||
from swh.loader.mercurial.utils import parse_visit_date | |||||
from swh.loader.tests import ( | from swh.loader.tests import ( | ||||
assert_last_visit_matches, | assert_last_visit_matches, | ||||
check_snapshot, | check_snapshot, | ||||
get_stats, | get_stats, | ||||
prepare_repository_from_archive, | prepare_repository_from_archive, | ||||
) | ) | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType | from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType | ||||
from swh.storage.algos.snapshot import snapshot_get_latest | from swh.storage.algos.snapshot import snapshot_get_latest | ||||
from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader | from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader | ||||
VISIT_DATE = parse_visit_date("2016-05-03 15:16:32+00") | |||||
assert VISIT_DATE is not None | |||||
def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path): | |||||
def test_loader_hg_new_visit_no_release(swh_storage, datadir, tmp_path): | |||||
"""Eventful visit should yield 1 snapshot""" | """Eventful visit should yield 1 snapshot""" | ||||
archive_name = "the-sandbox" | archive_name = "the-sandbox" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | ||||
loader = HgBundle20Loader(repo_url) | loader = HgBundle20Loader(swh_storage, repo_url) | ||||
assert loader.load() == {"status": "eventful"} | assert loader.load() == {"status": "eventful"} | ||||
tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c" | tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c" | ||||
tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56" | tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56" | ||||
expected_snapshot = Snapshot( | expected_snapshot = Snapshot( | ||||
id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), | id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), | ||||
branches={ | branches={ | ||||
b"develop": SnapshotBranch( | b"develop": SnapshotBranch( | ||||
target=hash_to_bytes(tip_revision_develop), | target=hash_to_bytes(tip_revision_develop), | ||||
target_type=TargetType.REVISION, | target_type=TargetType.REVISION, | ||||
), | ), | ||||
b"default": SnapshotBranch( | b"default": SnapshotBranch( | ||||
target=hash_to_bytes(tip_revision_default), | target=hash_to_bytes(tip_revision_default), | ||||
target_type=TargetType.REVISION, | target_type=TargetType.REVISION, | ||||
), | ), | ||||
b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,), | b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,), | ||||
}, | }, | ||||
) | ) | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, | swh_storage, | ||||
repo_url, | repo_url, | ||||
status="full", | status="full", | ||||
type="hg", | type="hg", | ||||
snapshot=expected_snapshot.id, | snapshot=expected_snapshot.id, | ||||
) | ) | ||||
check_snapshot(expected_snapshot, loader.storage) | check_snapshot(expected_snapshot, swh_storage) | ||||
stats = get_stats(loader.storage) | stats = get_stats(swh_storage) | ||||
assert stats == { | assert stats == { | ||||
"content": 2, | "content": 2, | ||||
"directory": 3, | "directory": 3, | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1, | "origin_visit": 1, | ||||
"release": 0, | "release": 0, | ||||
"revision": 58, | "revision": 58, | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
"snapshot": 1, | "snapshot": 1, | ||||
} | } | ||||
# Ensure archive loader yields the same snapshot | # Ensure archive loader yields the same snapshot | ||||
loader2 = HgArchiveBundle20Loader( | loader2 = HgArchiveBundle20Loader( | ||||
url=archive_path, | swh_storage, url=archive_path, archive_path=archive_path, visit_date=VISIT_DATE, | ||||
archive_path=archive_path, | |||||
visit_date="2016-05-03 15:16:32+00", | |||||
) | ) | ||||
actual_load_status = loader2.load() | actual_load_status = loader2.load() | ||||
assert actual_load_status == {"status": "eventful"} | assert actual_load_status == {"status": "eventful"} | ||||
stats2 = get_stats(loader2.storage) | stats2 = get_stats(loader2.storage) | ||||
expected_stats = copy.deepcopy(stats) | expected_stats = copy.deepcopy(stats) | ||||
expected_stats["origin"] += 1 | expected_stats["origin"] += 1 | ||||
expected_stats["origin_visit"] += 1 | expected_stats["origin_visit"] += 1 | ||||
assert stats2 == expected_stats | assert stats2 == expected_stats | ||||
# That visit yields the same snapshot | # That visit yields the same snapshot | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader2.storage, | loader2.storage, | ||||
archive_path, | archive_path, | ||||
status="full", | status="full", | ||||
type="hg", | type="hg", | ||||
snapshot=expected_snapshot.id, | snapshot=expected_snapshot.id, | ||||
) | ) | ||||
def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path): | def test_loader_hg_new_visit_with_release(swh_storage, datadir, tmp_path): | ||||
"""Eventful visit with release should yield 1 snapshot""" | """Eventful visit with release should yield 1 snapshot""" | ||||
archive_name = "hello" | archive_name = "hello" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | ||||
loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",) | loader = HgBundle20Loader(swh_storage, url=repo_url, visit_date=VISIT_DATE,) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status == {"status": "eventful"} | assert actual_load_status == {"status": "eventful"} | ||||
# then | # then | ||||
stats = get_stats(loader.storage) | stats = get_stats(swh_storage) | ||||
assert stats == { | assert stats == { | ||||
"content": 3, | "content": 3, | ||||
"directory": 3, | "directory": 3, | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1, | "origin_visit": 1, | ||||
"release": 1, | "release": 1, | ||||
"revision": 3, | "revision": 3, | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
"snapshot": 1, | "snapshot": 1, | ||||
} | } | ||||
# cf. test_loader.org for an explanation of where those hashes come from | # cf. test_loader.org for an explanation of where those hashes come from | ||||
tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140") | tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140") | ||||
release = loader.storage.release_get([tip_release])[0] | release = swh_storage.release_get([tip_release])[0] | ||||
assert release is not None | assert release is not None | ||||
tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27") | tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27") | ||||
revision = loader.storage.revision_get([tip_revision_default])[0] | revision = swh_storage.revision_get([tip_revision_default])[0] | ||||
assert revision is not None | assert revision is not None | ||||
expected_snapshot = Snapshot( | expected_snapshot = Snapshot( | ||||
id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"), | id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"), | ||||
branches={ | branches={ | ||||
b"default": SnapshotBranch( | b"default": SnapshotBranch( | ||||
target=tip_revision_default, target_type=TargetType.REVISION, | target=tip_revision_default, target_type=TargetType.REVISION, | ||||
), | ), | ||||
b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,), | b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,), | ||||
b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,), | b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,), | ||||
}, | }, | ||||
) | ) | ||||
check_snapshot(expected_snapshot, loader.storage) | check_snapshot(expected_snapshot, swh_storage) | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, | swh_storage, | ||||
repo_url, | repo_url, | ||||
type=RevisionType.MERCURIAL.value, | type=RevisionType.MERCURIAL.value, | ||||
status="full", | status="full", | ||||
snapshot=expected_snapshot.id, | snapshot=expected_snapshot.id, | ||||
) | ) | ||||
# Ensure archive loader yields the same snapshot | # Ensure archive loader yields the same snapshot | ||||
loader2 = HgArchiveBundle20Loader( | loader2 = HgArchiveBundle20Loader( | ||||
url=archive_path, | swh_storage, url=archive_path, archive_path=archive_path, visit_date=VISIT_DATE, | ||||
archive_path=archive_path, | |||||
visit_date="2016-05-03 15:16:32+00", | |||||
) | ) | ||||
actual_load_status = loader2.load() | actual_load_status = loader2.load() | ||||
assert actual_load_status == {"status": "eventful"} | assert actual_load_status == {"status": "eventful"} | ||||
stats2 = get_stats(loader2.storage) | stats2 = get_stats(loader2.storage) | ||||
expected_stats = copy.deepcopy(stats) | expected_stats = copy.deepcopy(stats) | ||||
expected_stats["origin"] += 1 | expected_stats["origin"] += 1 | ||||
expected_stats["origin_visit"] += 1 | expected_stats["origin_visit"] += 1 | ||||
assert stats2 == expected_stats | assert stats2 == expected_stats | ||||
# That visit yields the same snapshot | # That visit yields the same snapshot | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader2.storage, | loader2.storage, | ||||
archive_path, | archive_path, | ||||
status="full", | status="full", | ||||
type="hg", | type="hg", | ||||
snapshot=expected_snapshot.id, | snapshot=expected_snapshot.id, | ||||
) | ) | ||||
def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir): | def test_visit_with_archive_decompression_failure(swh_storage, mocker, datadir): | ||||
"""Failure to decompress should fail early, no data is ingested""" | """Failure to decompress should fail early, no data is ingested""" | ||||
mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") | mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib") | ||||
mock_patoo.side_effect = ValueError | mock_patoo.side_effect = ValueError | ||||
archive_name = "hello" | archive_name = "hello" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
loader = HgArchiveBundle20Loader( | loader = HgArchiveBundle20Loader( | ||||
url=archive_path, visit_date="2016-05-03 15:16:32+00", | swh_storage, url=archive_path, visit_date=VISIT_DATE, | ||||
) | ) | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status == {"status": "failed"} | assert actual_load_status == {"status": "failed"} | ||||
stats = get_stats(loader.storage) | stats = get_stats(swh_storage) | ||||
assert stats == { | assert stats == { | ||||
"content": 0, | "content": 0, | ||||
"directory": 0, | "directory": 0, | ||||
"origin": 1, | "origin": 1, | ||||
"origin_visit": 1, | "origin_visit": 1, | ||||
"release": 0, | "release": 0, | ||||
"revision": 0, | "revision": 0, | ||||
"skipped_content": 0, | "skipped_content": 0, | ||||
"snapshot": 0, | "snapshot": 0, | ||||
} | } | ||||
# That visit is recorded as failed and yields no snapshot | # That visit is recorded as failed and yields no snapshot | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, archive_path, status="failed", type="hg", snapshot=None | swh_storage, archive_path, status="failed", type="hg", snapshot=None | ||||
) | ) | ||||
def test_visit_error_with_snapshot_partial(swh_config, datadir, tmp_path, mocker): | def test_visit_error_with_snapshot_partial(swh_storage, datadir, tmp_path, mocker): | ||||
"""Incomplete ingestion leads to a 'partial' ingestion status""" | """Incomplete ingestion leads to a 'partial' ingestion status""" | ||||
mock = mocker.patch("swh.loader.mercurial.loader.HgBundle20Loader.store_metadata") | mock = mocker.patch("swh.loader.mercurial.loader.HgBundle20Loader.store_metadata") | ||||
mock.side_effect = ValueError | mock.side_effect = ValueError | ||||
archive_name = "the-sandbox" | archive_name = "the-sandbox" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | ||||
loader = HgBundle20Loader(repo_url) | loader = HgBundle20Loader(swh_storage, repo_url) | ||||
assert loader.load() == {"status": "failed"} | assert loader.load() == {"status": "failed"} | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, | swh_storage, | ||||
repo_url, | repo_url, | ||||
status="partial", | status="partial", | ||||
type="hg", | type="hg", | ||||
snapshot=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), | snapshot=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), | ||||
) | ) | ||||
@pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||
"error_msg", | "error_msg", | ||||
[ | [ | ||||
b"does not appear to be an HG repository", | b"does not appear to be an HG repository", | ||||
b"404: Not Found", | b"404: Not Found", | ||||
b"404: NOT FOUND", | b"404: NOT FOUND", | ||||
b"Name or service not known", | b"Name or service not known", | ||||
], | ], | ||||
) | ) | ||||
def test_visit_error_with_status_not_found( | def test_visit_error_with_status_not_found( | ||||
swh_config, datadir, tmp_path, mocker, error_msg | swh_storage, datadir, tmp_path, mocker, error_msg | ||||
): | ): | ||||
"""Not reaching the repo leads to a 'not_found' ingestion status""" | """Not reaching the repo leads to a 'not_found' ingestion status""" | ||||
mock = mocker.patch("hglib.clone") | mock = mocker.patch("hglib.clone") | ||||
mock.side_effect = CommandError((), 255, b"", error_msg) | mock.side_effect = CommandError((), 255, b"", error_msg) | ||||
archive_name = "the-sandbox" | archive_name = "the-sandbox" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | ||||
loader = HgBundle20Loader(repo_url) | loader = HgBundle20Loader(swh_storage, repo_url) | ||||
assert loader.load() == {"status": "uneventful"} | assert loader.load() == {"status": "uneventful"} | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, repo_url, status="not_found", type="hg", snapshot=None, | swh_storage, repo_url, status="not_found", type="hg", snapshot=None, | ||||
) | ) | ||||
def test_visit_error_with_clone_error(swh_config, datadir, tmp_path, mocker): | def test_visit_error_with_clone_error(swh_storage, datadir, tmp_path, mocker): | ||||
"""Testing failures other than 'not_found'""" | """Testing failures other than 'not_found'""" | ||||
mock = mocker.patch("hglib.clone") | mock = mocker.patch("hglib.clone") | ||||
mock.side_effect = CommandError((), 255, b"", b"out of disk space") | mock.side_effect = CommandError((), 255, b"", b"out of disk space") | ||||
archive_name = "the-sandbox" | archive_name = "the-sandbox" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | ||||
loader = HgBundle20Loader(repo_url) | loader = HgBundle20Loader(swh_storage, repo_url) | ||||
assert loader.load() == {"status": "failed"} | assert loader.load() == {"status": "failed"} | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, repo_url, status="failed", type="hg", snapshot=None, | swh_storage, repo_url, status="failed", type="hg", snapshot=None, | ||||
) | ) | ||||
def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path): | def test_visit_repository_with_transplant_operations(swh_storage, datadir, tmp_path): | ||||
"""Visit a mercurial repository with transplant operations within should yield a | """Visit a mercurial repository with transplant operations within should yield a | ||||
snapshot as well. | snapshot as well. | ||||
""" | """ | ||||
archive_name = "transplant" | archive_name = "transplant" | ||||
archive_path = os.path.join(datadir, f"{archive_name}.tgz") | archive_path = os.path.join(datadir, f"{archive_name}.tgz") | ||||
repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) | ||||
loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",) | loader = HgBundle20Loader(swh_storage, url=repo_url, visit_date=VISIT_DATE,) | ||||
# load hg repository | # load hg repository | ||||
actual_load_status = loader.load() | actual_load_status = loader.load() | ||||
assert actual_load_status == {"status": "eventful"} | assert actual_load_status == {"status": "eventful"} | ||||
# collect swh revisions | # collect swh revisions | ||||
assert_last_visit_matches( | assert_last_visit_matches( | ||||
loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" | swh_storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" | ||||
) | ) | ||||
revisions = [] | revisions = [] | ||||
snapshot = snapshot_get_latest(loader.storage, repo_url) | snapshot = snapshot_get_latest(swh_storage, repo_url) | ||||
for branch in snapshot.branches.values(): | for branch in snapshot.branches.values(): | ||||
if branch.target_type.value != "revision": | if branch.target_type.value != "revision": | ||||
continue | continue | ||||
revisions.append(branch.target) | revisions.append(branch.target) | ||||
# extract original changesets info and the transplant sources | # extract original changesets info and the transplant sources | ||||
hg_changesets = set() | hg_changesets = set() | ||||
transplant_sources = set() | transplant_sources = set() | ||||
for rev in loader.storage.revision_log(revisions): | for rev in swh_storage.revision_log(revisions): | ||||
hg_changesets.add(rev["metadata"]["node"]) | hg_changesets.add(rev["metadata"]["node"]) | ||||
for k, v in rev["extra_headers"]: | for k, v in rev["extra_headers"]: | ||||
if k == b"transplant_source": | if k == b"transplant_source": | ||||
transplant_sources.add(v.decode("ascii")) | transplant_sources.add(v.decode("ascii")) | ||||
# check extracted data are valid | # check extracted data are valid | ||||
assert len(hg_changesets) > 0 | assert len(hg_changesets) > 0 | ||||
assert len(transplant_sources) > 0 | assert len(transplant_sources) > 0 | ||||
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines |