test_loader.py
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
from pathlib import Path

from breezy.builtins import cmd_uncommit
import pytest

from swh.loader.bzr.loader import BazaarLoader, BzrDirectory
from swh.loader.tests import (
    assert_last_visit_matches,
    get_stats,
    prepare_repository_from_archive,
)
from swh.model.from_disk import Content
from swh.model.hashutil import hash_to_bytes
from swh.storage.algos.snapshot import snapshot_get_latest

# Generated repositories:
# - needs-upgrade:
#     - Repository needs upgrade
# - empty:
#     - Empty repo
# - renames:
#     - File rename
#     - Directory renames
#     - Directory renames *and* file rename conflicting
# - no-branch:
#     - No branch
# - metadata-and-type-changes:
#     - Directory removed
#     - Kind changed (file to symlink, directory to file, etc.)
#     - not changed_content and not renamed and not kind_changed (so, exec file?)
#     - Executable file
#     - Empty commit (bzr commit --unchanged)
# - ghosts:
#     - Ghost revisions
# - broken-tags:
#     - Tags corruption
# - does-not-support-tags:
#     - Repo is recent but branch does not support tags, needs to be upgraded

# TODO tests:
# - Root path listed in changes (does that even happen?)
# - Parent is :null (does that even happen?)
# - Case insensitive removal (Is it actually a problem?)
# - Truly corrupted revision?
# - No match from storage (wrong topo sort or broken rev)


def do_uncommit(repo_url):
    """Remove the latest revision from the given bzr repo"""
    uncommit_cmd = cmd_uncommit()
    with open(os.devnull, "w") as f:
        uncommit_cmd.outf = f
        uncommit_cmd.run(repo_url)


@pytest.mark.parametrize("do_clone", [False, True])
def test_nominal(swh_storage, datadir, tmp_path, do_clone):
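    """Load a nominal repository, either by cloning it or from a local directory,
    and check the resulting snapshot, stats and an example revision."""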
    archive_path = Path(datadir, "nominal.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)

    if do_clone:
        # Check that the cloning mechanism works
        loader = BazaarLoader(swh_storage, repo_url)
    else:
        loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "eventful"}

    assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    expected_branches = [
        b"HEAD",
        b"tags/0.1",
        b"tags/latest",
        b"tags/other-tag",
        b"trunk",
    ]
    assert sorted(snapshot.branches.keys()) == expected_branches

    stats = get_stats(swh_storage)
    assert stats == {
        "content": 7,
        "directory": 7,
        "origin": 1,
        "origin_visit": 1,
        "release": 3,
        "revision": 6,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # It contains associated bugs, making it a good complete candidate
    example_revision = hash_to_bytes("18bb5b2c866c10c58a191afcd0b450a8727f1c62")
    revision = loader.storage.revision_get([example_revision])[0]
    assert revision.to_dict() == {
        "message": b"fixing bugs",
        "author": {
            "fullname": b"Rapha\xc3\xabl Gom\xc3\xa8s <alphare@alphare-carbon.lan>",
            "name": b"Rapha\xc3\xabl Gom\xc3\xa8s",
            "email": b"alphare@alphare-carbon.lan",
        },
        "committer": {
            "fullname": b"Rapha\xc3\xabl Gom\xc3\xa8s <alphare@alphare-carbon.lan>",
            "name": b"Rapha\xc3\xabl Gom\xc3\xa8s",
            "email": b"alphare@alphare-carbon.lan",
        },
        "date": {
            "timestamp": {"seconds": 1643302390, "microseconds": 0},
            "offset_bytes": b"+0100",
        },
        "committer_date": {
            "timestamp": {"seconds": 1643302390, "microseconds": 0},
            "offset_bytes": b"+0100",
        },
        "type": "bzr",
        "directory": b"s0\xf3pe\xa3\x12\x05{\xc7\xbc\x86\xa6\x14.\xc1b\x1c\xeb\x05",
        "synthetic": False,
        "metadata": None,
        "parents": (b"*V\xf5\n\xf0?\x1d{kE4\xda(\xb1\x08R\x83\x87-\xb6",),
        "id": example_revision,
        "extra_headers": (
            (b"time_offset_seconds", b"3600"),
            (b"bug", b"fixed https://launchpad.net/bugs/1234"),
            (b"bug", b"fixed https://bz.example.com/?show_bug=4321"),
        ),
    }


def test_needs_upgrade(swh_storage, datadir, tmp_path, mocker):
    """Old bzr repository formats should be upgraded to the latest format"""
    archive_path = Path(datadir, "needs-upgrade.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "needs-upgrade", tmp_path)

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    upgrade_spy = mocker.spy(loader, "run_upgrade")
    res = loader.load()
    upgrade_spy.assert_called()
    assert res == {"status": "uneventful"}  # needs-upgrade is an empty repo


def test_does_not_support_tags(swh_storage, datadir, tmp_path, mocker):
    """Repository format is correct, but the branch itself does not support tags
    and should be upgraded to the latest format"""
    archive_path = Path(datadir, "does-not-support-tags.tgz")
    path = "does-not-support-tags-repo/does-not-support-tags-branch"
    repo_url = prepare_repository_from_archive(
        archive_path,
        path,
        tmp_path,
    )

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    upgrade_spy = mocker.spy(loader, "run_upgrade")
    res = loader.load()
    upgrade_spy.assert_called()
    assert res == {"status": "uneventful"}  # does-not-support-tags is an empty repo


def test_no_branch(swh_storage, datadir, tmp_path):
    """This should only happen with a broken clone, so the expected result is failure"""
    archive_path = Path(datadir, "no-branch.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "no-branch", tmp_path)

    res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
    assert res == {"status": "failed"}


def test_empty(swh_storage, datadir, tmp_path):
    """An empty repository is fine, it just contains no information"""
    archive_path = Path(datadir, "empty.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "empty", tmp_path)
    res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
    assert res == {"status": "uneventful"}

    # An empty snapshot does not bother the incremental code
    res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
    assert res == {"status": "uneventful"}


def test_renames(swh_storage, datadir, tmp_path):
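    """File and directory renames (including conflicting ones) should load correctly."""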
    archive_path = Path(datadir, "renames.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "renames", tmp_path)
    res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
    assert res == {"status": "eventful"}
    assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    assert sorted(snapshot.branches.keys()) == [
        b"HEAD",
        b"trunk",
    ]

    stats = get_stats(swh_storage)
    assert stats == {
        "content": 1,
        "directory": 5,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 2,
        "skipped_content": 0,
        "snapshot": 1,
    }


def test_broken_tags(swh_storage, datadir, tmp_path):
    """A tag pointing to the null revision should not break anything"""
    archive_path = Path(datadir, "broken-tags.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "broken-tags", tmp_path)
    res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
    assert res == {"status": "uneventful"}
    assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    assert sorted(snapshot.branches.keys()) == [
        b"tags/null-tag",  # broken tag does appear, but didn't cause any issues
    ]

    stats = get_stats(swh_storage)
    assert stats == {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,  # Does not count as a valid release
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }


def test_metadata_and_type_changes(swh_storage, datadir, tmp_path):
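    """Directory removals and kind changes (file to symlink, directory to file, etc.)
    should load correctly."""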
    archive_path = Path(datadir, "metadata-and-type-changes.tgz")
    repo_url = prepare_repository_from_archive(
        archive_path, "metadata-and-type-changes", tmp_path
    )
    res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
    assert res == {"status": "eventful"}
    assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    assert sorted(snapshot.branches.keys()) == [
        b"HEAD",
        b"trunk",
    ]

    stats = get_stats(swh_storage)
    assert stats == {
        "content": 1,
        "directory": 9,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 7,
        "skipped_content": 0,
        "snapshot": 1,
    }


def test_ghosts(swh_storage, datadir, tmp_path):
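    """Ghost revisions (and tags pointing to them) should be tracked without being
    loaded as revisions or releases."""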
    archive_path = Path(datadir, "ghosts.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "ghosts", tmp_path)
    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    assert loader._ghosts == set()
    res = loader.load()
    assert loader._ghosts == set((b"iamaghostboo",))
    assert res == {"status": "eventful"}
    assert_last_visit_matches(swh_storage, repo_url, status="full", type="bzr")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    assert sorted(snapshot.branches.keys()) == [
        b"HEAD",
        b"tags/brokentag",  # tag pointing to a ghost revision is tracked
        b"trunk",
    ]

    stats = get_stats(swh_storage)
    assert stats == {
        "content": 0,  # No contents
        "directory": 1,  # Root directory always counts
        "origin": 1,
        "origin_visit": 1,
        "release": 0,  # Ghost tag is ignored, stored as dangling
        "revision": 1,  # Only one revision, the ghost is ignored
        "skipped_content": 0,
        "snapshot": 1,
    }


def test_bzr_directory():
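    """Check the mapping-like get/set/delete behavior of the BzrDirectory helper."""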
    directory = BzrDirectory()
    directory[b"a/decently/enough/nested/path"] = Content(b"whatever")
    directory[b"a/decently/other_node"] = Content(b"whatever else")
    directory[b"another_node"] = Content(b"contents")

    assert directory[b"a/decently/enough/nested/path"] == Content(b"whatever")
    assert directory[b"a/decently/other_node"] == Content(b"whatever else")
    assert directory[b"another_node"] == Content(b"contents")

    del directory[b"a/decently/enough/nested/path"]
    assert directory.get(b"a/decently/enough/nested/path") is None
    assert directory.get(b"a/decently/enough/nested/") is None
    assert directory.get(b"a/decently/enough") is None

    # These entries still exist, so the lookups must not raise KeyError
    directory[b"a/decently"]
    directory[b"a"]
    directory[b"another_node"]


def test_incremental_noop(swh_storage, datadir, tmp_path):
    """Check that nothing happens if we try to load a repo twice in a row"""
    archive_path = Path(datadir, "nominal.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "eventful"}

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "uneventful"}


def test_incremental_nominal(swh_storage, datadir, tmp_path):
    """Check that an updated repository is picked up on the second run, and that
    a third run is still a no-op."""
    archive_path = Path(datadir, "nominal.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)

    # Remove the 2 latest commits
    do_uncommit(repo_url)
    do_uncommit(repo_url)

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "eventful"}
    stats = get_stats(swh_storage)
    assert stats == {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 4,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # Load the complete repo now
    repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "eventful"}

    stats = get_stats(swh_storage)
    expected_stats = {
        "content": 7,
        "directory": 7,
        "origin": 1,
        "origin_visit": 2,
        "release": 3,
        "revision": 6,
        "skipped_content": 0,
        "snapshot": 2,
    }
    assert stats == expected_stats

    # Nothing should change
    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "uneventful"}

    stats = get_stats(swh_storage)
    assert stats == {**expected_stats, "origin_visit": 2 + 1}


def test_incremental_uncommitted_head(swh_storage, datadir, tmp_path):
    """Check that doing an incremental run with the saved head missing does not
    error out but instead loads everything correctly"""
    archive_path = Path(datadir, "nominal.tgz")
    repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "eventful"}
    stats = get_stats(swh_storage)
    expected_stats = {
        "content": 7,
        "directory": 7,
        "origin": 1,
        "origin_visit": 1,
        "release": 3,
        "revision": 6,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert stats == expected_stats

    # Remove the previously saved head
    do_uncommit(repo_url)

    loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
    res = loader.load()
    assert res == {"status": "eventful"}

    # Everything is loaded correctly
    stats = get_stats(swh_storage)
    assert stats == {**expected_stats, "origin_visit": 1 + 1, "snapshot": 1 + 1}
