# test_loader.py
# Copyright (C) 2016-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import shutil
import subprocess
import textwrap
from typing import Any, Dict
import pytest
from subvertpy import SubversionException
from swh.loader.svn.loader import (
SvnLoader,
SvnLoaderFromDumpArchive,
SvnLoaderFromRemoteDump,
)
from swh.loader.svn.svn import SvnRepo
from swh.loader.svn.utils import init_svn_repo_from_dump
from swh.loader.tests import (
assert_last_visit_matches,
check_snapshot,
get_stats,
prepare_repository_from_archive,
)
from swh.model.from_disk import DentryPerms
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Snapshot, SnapshotBranch, TargetType
from .utils import CommitChange, CommitChangeType, add_commit
# Expected snapshot for the first ingestion of the pkg-gourmet test
# repository (pkg-gourmet.tgz archive in the tests datadir).
GOURMET_SNAPSHOT = Snapshot(
    id=hash_to_bytes("889cacc2731e3312abfb2b1a0c18ade82a949e07"),
    branches={
        b"HEAD": SnapshotBranch(
            target=hash_to_bytes("4876cb10aec6f708f7466dddf547567b65f6c39c"),
            target_type=TargetType.REVISION,
        )
    },
)

# Expected snapshot after loading the updated version of the same repository
# (pkg-gourmet-with-updates.tgz archive).
GOURMET_UPDATES_SNAPSHOT = Snapshot(
    id=hash_to_bytes("11086d15317014e43d2438b7ffc712c44f1b8afe"),
    branches={
        b"HEAD": SnapshotBranch(
            target=hash_to_bytes("171dc35522bfd17dda4e90a542a0377fb2fc707a"),
            target_type=TargetType.REVISION,
        )
    },
)
def test_loader_svn_not_found_no_mock(swh_storage, tmp_path):
    """Given an unknown repository, the loader visit ends up in status not_found"""
    bogus_url = "unknown-repository"
    svn_loader = SvnLoader(swh_storage, bogus_url, temp_directory=tmp_path)

    # nothing could be ingested from a missing repository
    assert svn_loader.load() == {"status": "uneventful"}

    # and the visit is recorded as not_found
    assert_last_visit_matches(swh_storage, bogus_url, status="not_found", type="svn")
@pytest.mark.parametrize(
    "exception_msg",
    [
        "Unable to connect to a repository at URL",
        "Unknown URL type",
    ],
)
def test_loader_svn_not_found(swh_storage, tmp_path, exception_msg, mocker):
    """Given unknown repository issues, the loader visit ends up in status not_found"""
    # make any SvnRepo instantiation raise a "repository missing"-like error
    svn_repo_mock = mocker.patch("swh.loader.svn.loader.SvnRepo")
    svn_repo_mock.side_effect = SubversionException(exception_msg, 0)

    origin_url = "unknown-repository"
    loader = SvnLoader(swh_storage, origin_url, temp_directory=tmp_path)

    assert loader.load() == {"status": "uneventful"}
    assert_last_visit_matches(swh_storage, origin_url, status="not_found", type="svn")
@pytest.mark.parametrize(
    "exception",
    [
        SubversionException("Irrelevant message, considered a failure", 10),
        SubversionException("Present but fails to read, considered a failure", 20),
        ValueError("considered a failure"),
    ],
)
def test_loader_svn_failures(swh_storage, tmp_path, exception, mocker):
    """Given any errors raised, the loader visit ends up in status failed"""
    # any other exception than the "not found" ones must mark the visit failed
    mocker.patch("swh.loader.svn.loader.SvnRepo").side_effect = exception

    origin_url = "existing-repo-url"
    loader = SvnLoader(swh_storage, origin_url, temp_directory=tmp_path)

    assert loader.load() == {"status": "failed"}
    assert_last_visit_matches(swh_storage, origin_url, status="failed", type="svn")
def test_loader_svnrdump_not_found(swh_storage, tmp_path):
    """Loading from remote dump which does not exist should end up as not_found visit

    Note: the unused ``mocker`` fixture was dropped from the signature; this
    test exercises the real loader against a missing local path.
    """
    unknown_repo_url = "file:///tmp/svn.code.sf.net/p/white-rats-studios/svn"
    loader = SvnLoaderFromRemoteDump(
        swh_storage, unknown_repo_url, temp_directory=tmp_path
    )
    assert loader.load() == {"status": "uneventful"}
    assert_last_visit_matches(
        swh_storage,
        unknown_repo_url,
        status="not_found",
        type="svn",
    )
def test_loader_svnrdump_no_such_revision(swh_storage, tmp_path, datadir):
    """Visit multiple times an origin with the remote loader should not raise.

    It used to fail the ingestion on the second visit with a "No such revision x,
    160006" message.
    """
    archive_ori_dump = os.path.join(datadir, "penguinsdbtools2018.dump.gz")
    archive_dump_dir = os.path.join(tmp_path, "dump")
    os.mkdir(archive_dump_dir)
    archive_dump = os.path.join(archive_dump_dir, "penguinsdbtools2018.dump.gz")
    # loader now drops the dump as soon as it's mounted so we need to make a copy first
    shutil.copyfile(archive_ori_dump, archive_dump)
    loading_path = str(tmp_path / "loading")
    os.mkdir(loading_path)
    # Prepare the dump as a local svn repository for test purposes; the first
    # element of the returned tuple (the temporary root dir) is unused here
    _, repo_path = init_svn_repo_from_dump(archive_dump, root_dir=tmp_path, gzip=True)
    repo_url = f"file://{repo_path}"
    # first visit: everything gets ingested
    loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=loading_path)
    assert loader.load() == {"status": "eventful"}
    actual_visit = assert_last_visit_matches(
        swh_storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    loader2 = SvnLoaderFromRemoteDump(
        swh_storage, repo_url, temp_directory=loading_path
    )
    # Visiting a second time the same repository should be uneventful...
    assert loader2.load() == {"status": "uneventful"}
    actual_visit2 = assert_last_visit_matches(
        swh_storage,
        repo_url,
        status="full",
        type="svn",
    )
    assert actual_visit.snapshot is not None
    # ... with the same snapshot as the first visit
    assert actual_visit2.snapshot == actual_visit.snapshot
def test_loader_svn_new_visit(swh_storage, datadir, tmp_path):
    """Eventful visit should yield 1 snapshot"""
    archive_name = "pkg-gourmet"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}.tgz"), archive_name, tmp_path
    )

    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(loader.snapshot, loader.storage)

    # all objects from the test repository must have been ingested
    expected_stats = {
        "content": 19,
        "directory": 17,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 6,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert get_stats(loader.storage) == expected_stats

    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
def test_loader_svn_2_visits_no_change(swh_storage, datadir, tmp_path):
    """Visit multiple times a repository with no change should yield the same snapshot"""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    # first visit: eventful, produces the expected snapshot
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    visit_status1 = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(loader.snapshot, loader.storage)
    # second visit with the same loader: no change in the repository, so
    # uneventful, but the visit still points at the same (full) snapshot
    assert loader.load() == {"status": "uneventful"}
    visit_status2 = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    assert visit_status1.date < visit_status2.date
    assert visit_status1.snapshot == visit_status2.snapshot
    stats = get_stats(loader.storage)
    assert stats["origin_visit"] == 1 + 1  # computed twice the same snapshot
    assert stats["snapshot"] == 1
    # even starting from previous revision...
    # NOTE(review): start_revision is fetched and checked for existence but is
    # not passed to the loader below — presumably a leftover from an older
    # incremental-loading API; confirm whether it can be dropped
    start_revision = loader.storage.revision_get(
        [hash_to_bytes("95edacc8848369d6fb1608e887d6d2474fd5224f")]
    )[0]
    assert start_revision is not None
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "uneventful"}
    stats = get_stats(loader.storage)
    assert stats["origin_visit"] == 2 + 1
    # ... with no change in repository, this yields the same snapshot
    assert stats["snapshot"] == 1
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
def test_loader_tampered_repository(swh_storage, datadir, tmp_path):
    """In this scenario, the dump has been tampered with to modify the
    commit log [1]. This results in a hash divergence which is
    detected at startup after a new run for the same origin.

    In effect, this will perform a complete reloading of the repository.

    [1] Tampering with revision 6 log message following:

        tar xvf pkg-gourmet.tgz # initial repository ingested
        cd pkg-gourmet/
        echo "Tampering with commit log message for fun and profit" > log.txt
        svnadmin setlog . -r 6 log.txt --bypass-hooks
        tar cvf pkg-gourmet-tampered-rev6-log.tgz pkg-gourmet/
    """
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    # first visit on the pristine repository
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(loader.snapshot, loader.storage)
    # second visit, on the tampered repository but with the same origin URL:
    # the hash divergence must trigger a full reload yielding a new snapshot
    archive_path2 = os.path.join(datadir, "pkg-gourmet-tampered-rev6-log.tgz")
    repo_tampered_url = prepare_repository_from_archive(
        archive_path2, archive_name, tmp_path
    )
    loader2 = SvnLoader(
        swh_storage, repo_tampered_url, origin_url=repo_url, temp_directory=tmp_path
    )
    assert loader2.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader2.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=hash_to_bytes("5aa61959e788e281fd6e187053d0f46c68e8d8bb"),
    )
    # bug fix: check the snapshot produced by the second load (loader2),
    # not the first loader's snapshot as the original test did
    check_snapshot(loader2.snapshot, loader2.storage)
    stats = get_stats(loader.storage)
    assert stats["origin"] == 1
    assert stats["origin_visit"] == 2
    assert stats["snapshot"] == 2
def test_loader_svn_visit_with_changes(swh_storage, datadir, tmp_path):
    """In this scenario, the repository has been updated with new changes.

    The loading visit should result in new objects stored and 1 new
    snapshot.
    """
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_initial_url = prepare_repository_from_archive(
        archive_path, archive_name, tmp_path
    )
    # repo_initial_url becomes the origin_url we want to visit some more below
    loader = SvnLoader(swh_storage, repo_initial_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    visit_status1 = assert_last_visit_matches(
        loader.storage,
        repo_initial_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
    # second visit: same origin, but the repository now holds extra commits
    # (pkg-gourmet-with-updates archive)
    archive_path = os.path.join(datadir, "pkg-gourmet-with-updates.tgz")
    repo_updated_url = prepare_repository_from_archive(
        archive_path, "pkg-gourmet", tmp_path
    )
    loader = SvnLoader(
        swh_storage,
        repo_updated_url,
        origin_url=repo_initial_url,
        temp_directory=tmp_path,
    )
    assert loader.load() == {"status": "eventful"}
    visit_status2 = assert_last_visit_matches(
        loader.storage,
        repo_updated_url,
        status="full",
        type="svn",
        snapshot=GOURMET_UPDATES_SNAPSHOT.id,
    )
    # the new visit is later and produced a different snapshot
    assert visit_status1.date < visit_status2.date
    assert visit_status1.snapshot != visit_status2.snapshot
    stats = get_stats(loader.storage)
    assert stats == {
        "content": 22,
        "directory": 28,
        "origin": 1,
        "origin_visit": 2,
        "release": 0,
        "revision": 11,
        "skipped_content": 0,
        "snapshot": 2,
    }
    check_snapshot(GOURMET_UPDATES_SNAPSHOT, loader.storage)
    # Let's start the ingestion from the start, this should yield the same result
    loader = SvnLoader(
        swh_storage,
        repo_updated_url,
        origin_url=repo_initial_url,
        incremental=False,
        temp_directory=tmp_path,
    )
    assert loader.load() == {"status": "eventful"}
    visit_status3 = assert_last_visit_matches(
        loader.storage,
        repo_updated_url,
        status="full",
        type="svn",
        snapshot=GOURMET_UPDATES_SNAPSHOT.id,
    )
    # a full (non-incremental) reload converges on the same snapshot
    assert visit_status2.date < visit_status3.date
    assert visit_status3.snapshot == visit_status2.snapshot
    check_snapshot(GOURMET_UPDATES_SNAPSHOT, loader.storage)
    stats = get_stats(loader.storage)
    assert stats["origin"] == 1  # always the same visit
    assert stats["origin_visit"] == 2 + 1  # 1 more visit
    assert stats["snapshot"] == 2  # no new snapshot
def test_loader_svn_visit_start_from_revision(swh_storage, datadir, tmp_path):
    """Starting from existing revision, next visit on changed repo should yield 1 new
    snapshot.
    """
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_initial_url = prepare_repository_from_archive(
        archive_path, archive_name, tmp_path
    )
    # repo_initial_url becomes the origin_url we want to visit some more below
    loader = SvnLoader(swh_storage, repo_initial_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    visit_status1 = assert_last_visit_matches(
        loader.storage,
        repo_initial_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
    # NOTE(review): start_revision is fetched and checked for existence but is
    # not passed to the loader below — the incremental start point is presumably
    # recovered from storage by the loader itself; confirm and drop if unused
    start_revision = loader.storage.revision_get(
        [hash_to_bytes("95edacc8848369d6fb1608e887d6d2474fd5224f")]
    )[0]
    assert start_revision is not None
    archive_path = os.path.join(datadir, "pkg-gourmet-with-updates.tgz")
    repo_updated_url = prepare_repository_from_archive(
        archive_path, "pkg-gourmet", tmp_path
    )
    # we'll start from start_revision
    loader = SvnLoader(
        swh_storage,
        repo_updated_url,
        origin_url=repo_initial_url,
        temp_directory=tmp_path,
    )
    assert loader.load() == {"status": "eventful"}
    # nonetheless, we obtain the same snapshot (as previous tests on that repository)
    visit_status2 = assert_last_visit_matches(
        loader.storage,
        repo_updated_url,
        status="full",
        type="svn",
        snapshot=GOURMET_UPDATES_SNAPSHOT.id,
    )
    assert visit_status1.date < visit_status2.date
    assert visit_status1.snapshot != visit_status2.snapshot
    stats = get_stats(loader.storage)
    assert stats == {
        "content": 22,
        "directory": 28,
        "origin": 1,
        "origin_visit": 2,
        "release": 0,
        "revision": 11,
        "skipped_content": 0,
        "snapshot": 2,
    }
    check_snapshot(GOURMET_UPDATES_SNAPSHOT, loader.storage)
def test_loader_svn_visit_with_eol_style(swh_storage, datadir, tmp_path):
    """Check that a svn repo containing a versioned file with CRLF line
    endings with svn:eol-style property set to 'native' (this is a
    violation of svn specification as the file should have been
    stored with LF line endings) can be loaded anyway.
    """
    archive_name = "mediawiki-repo-r407-eol-native-crlf"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}.tgz"), archive_name, tmp_path
    )

    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}

    expected_snapshot = Snapshot(
        id=hash_to_bytes("d6d6e9703f157c5702d9a4a5dec878926ed4ab76"),
        branches={
            b"HEAD": SnapshotBranch(
                target=hash_to_bytes("7da4975c363101b819756d33459f30a866d01b1b"),
                target_type=TargetType.REVISION,
            )
        },
    )
    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=expected_snapshot.id,
    )

    stats = get_stats(loader.storage)
    assert stats["origin"] == 1
    assert stats["origin_visit"] == 1
    assert stats["snapshot"] == 1
def test_loader_svn_visit_with_mixed_crlf_lf(swh_storage, datadir, tmp_path):
    """Check that a svn repo containing a versioned file with mixed
    CRLF/LF line endings with svn:eol-style property set to 'native'
    (this is a violation of svn specification as mixed line endings
    for textual content should not be stored when the svn:eol-style
    property is set) can be loaded anyway.
    """
    archive_name = "pyang-repo-r343-eol-native-mixed-lf-crlf"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}.tgz"), archive_name, tmp_path
    )

    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}

    expected_snapshot = Snapshot(
        id=hash_to_bytes("6d9590de11b00a5801de0ff3297c5b44bbbf7d24"),
        branches={
            b"HEAD": SnapshotBranch(
                target=hash_to_bytes("9c6962eeb9164a636c374be700672355e34a98a7"),
                target_type=TargetType.REVISION,
            )
        },
    )
    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=expected_snapshot.id,
    )

    stats = get_stats(loader.storage)
    assert stats["origin"] == 1
    assert stats["origin_visit"] == 1
    assert stats["snapshot"] == 1
def test_loader_svn_with_symlink(swh_storage, datadir, tmp_path):
    """Repository with symlinks should be ingested ok

    Edge case:

       - first create a file and commit it.
         Remove it, then add folder holding the same name, commit.
       - do the same scenario with symbolic link (instead of file)
    """
    archive_name = "pkg-gourmet"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, "pkg-gourmet-with-edge-case-links-and-files.tgz"),
        archive_name,
        tmp_path,
    )

    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}

    expected_snapshot = Snapshot(
        id=hash_to_bytes("18e60982fe521a2546ab8c3c73a535d80462d9d0"),
        branches={
            b"HEAD": SnapshotBranch(
                target=hash_to_bytes("3f43af2578fccf18b0d4198e48563da7929dc608"),
                target_type=TargetType.REVISION,
            )
        },
    )
    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=expected_snapshot.id,
    )

    stats = get_stats(loader.storage)
    assert stats["origin"] == 1
    assert stats["origin_visit"] == 1
    assert stats["snapshot"] == 1
    assert stats["revision"] == 19
def test_loader_svn_with_wrong_symlinks(swh_storage, datadir, tmp_path):
    """Repository with wrong symlinks should be ingested ok nonetheless

    Edge case:

       - wrong symbolic link
       - wrong symbolic link with empty space names
    """
    archive_name = "pkg-gourmet"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, "pkg-gourmet-with-wrong-link-cases.tgz"),
        archive_name,
        tmp_path,
    )

    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}

    expected_snapshot = Snapshot(
        id=hash_to_bytes("b17f38acabb90f066dedd30c29f01a02af88a5c4"),
        branches={
            b"HEAD": SnapshotBranch(
                target=hash_to_bytes("cf30d3bb9d5967d0a2bbeacc405f10a5dd9b138a"),
                target_type=TargetType.REVISION,
            )
        },
    )
    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=expected_snapshot.id,
    )

    stats = get_stats(loader.storage)
    assert stats["origin"] == 1
    assert stats["origin_visit"] == 1
    assert stats["snapshot"] == 1
    assert stats["revision"] == 21
def test_loader_svn_cleanup_loader(swh_storage, datadir, tmp_path):
    """Loader should clean up its working directory after the load"""
    archive_name = "pkg-gourmet"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}.tgz"), archive_name, tmp_path
    )

    working_dir = str(tmp_path / "loading")
    os.mkdir(working_dir)

    loader = SvnLoader(swh_storage, repo_url, temp_directory=working_dir)
    assert loader.load() == {"status": "eventful"}

    # the root temporary directory still exists...
    assert os.path.exists(loader.temp_directory)
    # ...but it should be empty
    assert os.listdir(loader.temp_directory) == []
def test_loader_svn_cleanup_loader_from_remote_dump(swh_storage, datadir, tmp_path):
    """Loader should clean up its working directory after the load"""
    archive_name = "pkg-gourmet"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}.tgz"), archive_name, tmp_path
    )

    working_dir = str(tmp_path / "loading")
    os.mkdir(working_dir)

    loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=working_dir)
    assert loader.load() == {"status": "eventful"}

    # the root temporary directory still exists...
    assert os.path.exists(loader.temp_directory)
    # ...but it should be empty
    assert os.listdir(loader.temp_directory) == []
    # the internal temp_dir should be cleaned up though
    assert not os.path.exists(loader.temp_dir)
def test_loader_svn_cleanup_loader_from_dump_archive(swh_storage, datadir, tmp_path):
    """Loader should clean up its working directory after the load"""
    archive_ori_dump = os.path.join(datadir, "penguinsdbtools2018.dump.gz")
    archive_dump_dir = os.path.join(tmp_path, "dump")
    os.mkdir(archive_dump_dir)
    archive_dump = os.path.join(archive_dump_dir, "penguinsdbtools2018.dump.gz")
    # loader now drops the dump as soon as it's mounted so we need to make a copy first
    shutil.copyfile(archive_ori_dump, archive_dump)
    loading_path = str(tmp_path / "loading")
    os.mkdir(loading_path)
    # Prepare the dump as a local svn repository for test purposes; the first
    # element of the returned tuple (the temporary root dir) is unused here
    _, repo_path = init_svn_repo_from_dump(archive_dump, root_dir=tmp_path, gzip=True)
    repo_url = f"file://{repo_path}"
    loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=loading_path)
    assert loader.load() == {"status": "eventful"}
    # the root temporary directory still exists
    assert os.path.exists(loader.temp_directory)
    # but it should be empty
    assert os.listdir(loader.temp_directory) == []
    # the internal temp_dir should be cleaned up though
    assert not os.path.exists(loader.temp_dir)
def test_svn_loader_from_remote_dump(swh_storage, datadir, tmpdir_factory):
    """Loading a repository through the remote-dump loader, then through the
    regular loader (under another origin URL), must converge on the same
    snapshot, and a second remote-dump visit must be uneventful."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")

    # first visit, through the remote-dump loader
    tmp_path = tmpdir_factory.mktemp("repo1")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    dump_loader = SvnLoaderFromRemoteDump(
        swh_storage, repo_url, temp_directory=tmp_path
    )
    assert dump_loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        dump_loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )

    # rename to another origin
    tmp_path = tmpdir_factory.mktemp("repo2")
    origin_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    loader = SvnLoader(
        swh_storage, repo_url, origin_url=origin_url, temp_directory=tmp_path
    )
    assert loader.load() == {"status": "eventful"}  # because are working on new origin
    assert_last_visit_matches(
        loader.storage,
        origin_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)

    stats = get_stats(loader.storage)
    assert stats["origin"] == 2  # created one more origin
    assert stats["origin_visit"] == 2
    assert stats["snapshot"] == 1

    # no change on the origin-url
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "uneventful"}
    assert_last_visit_matches(
        loader.storage,
        origin_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )

    stats = get_stats(loader.storage)
    assert stats["origin"] == 2
    assert stats["origin_visit"] == 3
    assert stats["snapshot"] == 1

    # second visit from the dump should be uneventful
    dump_loader = SvnLoaderFromRemoteDump(
        swh_storage, repo_url, temp_directory=tmp_path
    )
    assert dump_loader.load() == {"status": "uneventful"}
def test_svn_loader_from_remote_dump_incremental_load_on_stale_repo(
    swh_storage, datadir, tmp_path, mocker
):
    """On an unchanged (stale) repository, an incremental load with the
    remote-dump loader must skip the dump, mount and load phases entirely."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    # first load: a dump file will be created, mounted to a local repository
    # and the latter will be loaded into the archive
    loaderFromDump = SvnLoaderFromRemoteDump(
        swh_storage, repo_url, temp_directory=tmp_path
    )
    assert loaderFromDump.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loaderFromDump.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    # second load on same repository: the loader will detect there is no changes
    # since last load and will skip the dump, mount and load phases
    loaderFromDump = SvnLoaderFromRemoteDump(
        swh_storage, repo_url, temp_directory=tmp_path
    )
    loaderFromDump.dump_svn_revisions = mocker.MagicMock()
    # use a distinct name for the patched function so the module-level
    # init_svn_repo_from_dump import is not shadowed
    init_svn_repo_from_dump_mock = mocker.patch(
        "swh.loader.svn.loader.init_svn_repo_from_dump"
    )
    loaderFromDump.process_svn_revisions = mocker.MagicMock()
    loaderFromDump._check_revision_divergence = mocker.MagicMock()
    assert loaderFromDump.load() == {"status": "uneventful"}
    assert_last_visit_matches(
        loaderFromDump.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    # no dump
    loaderFromDump.dump_svn_revisions.assert_not_called()
    # no mount
    init_svn_repo_from_dump_mock.assert_not_called()
    # no loading
    loaderFromDump.process_svn_revisions.assert_not_called()
    # no redundant post_load processing
    loaderFromDump._check_revision_divergence.assert_not_called()
def test_svn_loader_from_remote_dump_incremental_load_on_non_stale_repo(
    swh_storage, datadir, tmp_path, mocker
):
    """When the repository gained new revisions between two visits, the
    remote-dump loader must go through the dump and load phases again."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

    # first load
    first_loader = SvnLoaderFromRemoteDump(
        swh_storage, repo_url, temp_directory=tmp_path
    )
    first_loader.load()

    # update the repository content with extra commits
    archive_path = os.path.join(datadir, "pkg-gourmet-with-updates.tgz")
    repo_updated_url = prepare_repository_from_archive(
        archive_path, archive_name, tmp_path
    )

    # second load
    second_loader = SvnLoaderFromRemoteDump(
        swh_storage, repo_updated_url, temp_directory=tmp_path
    )
    dump_spy = mocker.spy(second_loader, "dump_svn_revisions")
    process_spy = mocker.spy(second_loader, "process_svn_revisions")
    second_loader.load()

    # both the dump and the load phases must have run
    dump_spy.assert_called()
    process_spy.assert_called()
def test_loader_user_defined_svn_properties(swh_storage, datadir, tmp_path):
    """Edge cases: The repository held some user defined svn-properties with special
    encodings, this prevented the repository from being loaded even though we do not
    ingest those information.
    """
    archive_name = "httthttt"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}.tgz"), archive_name, tmp_path
    )

    loader = SvnLoader(swh_storage, repo_url)
    assert loader.load() == {"status": "eventful"}

    expected_snapshot = Snapshot(
        id=hash_to_bytes("70487267f682c07e52a2371061369b6cf5bffa47"),
        branches={
            b"HEAD": SnapshotBranch(
                target=hash_to_bytes("604a17dbb15e8d7ecb3e9f3768d09bf493667a93"),
                target_type=TargetType.REVISION,
            )
        },
    )
    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=expected_snapshot.id,
    )

    stats = get_stats(loader.storage)
    assert stats["origin"] == 1
    assert stats["origin_visit"] == 1
    assert stats["snapshot"] == 1
    assert stats["revision"] == 7
def test_loader_svn_dir_added_then_removed(swh_storage, datadir, tmp_path):
    """Loader should handle directory removal when processing a commit"""
    archive_name = "pkg-gourmet"
    repo_url = prepare_repository_from_archive(
        os.path.join(datadir, f"{archive_name}-add-remove-dir.tgz"),
        archive_name,
        tmp_path,
    )

    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_loader_svn_loader_from_dump_archive(swh_storage, datadir, tmp_path):
    """Loading from a gzip-compressed dump archive should yield the same
    snapshot and stats as loading the repository directly."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    dump_filename = f"{archive_name}.dump"
    with open(os.path.join(tmp_path, dump_filename), "wb") as dump_file:
        # create compressed dump file of pkg-gourmet repo; check=True makes the
        # test fail early with a clear error if svnrdump or gzip is missing or
        # fails, instead of silently loading an empty/truncated dump below
        subprocess.run(["svnrdump", "dump", repo_url], stdout=dump_file, check=True)
        subprocess.run(["gzip", dump_filename], cwd=tmp_path, check=True)
    # load svn repo from that compressed dump file
    loader = SvnLoaderFromDumpArchive(
        swh_storage,
        url=repo_url,
        archive_path=os.path.join(tmp_path, f"{dump_filename}.gz"),
        temp_directory=tmp_path,
    )
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
    assert get_stats(loader.storage) == {
        "content": 19,
        "directory": 17,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 6,
        "skipped_content": 0,
        "snapshot": 1,
    }
def test_loader_eol_style_file_property_handling_edge_case(
    swh_storage, repo_url, tmp_path
):
    """Deleting then re-adding a file whose svn:eol-style property differs
    between the two versions must not break the loading: the exported content
    differs (CRLF vs LF) depending on whether the property is set.
    """
    # first commit
    add_commit(
        repo_url,
        (
            "Add a directory containing a file with CRLF end of line "
            "and set svn:eol-style property to native so CRLF will be "
            "replaced by LF in the file when exporting the revision"
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="directory/file_with_crlf_eol.txt",
                properties={"svn:eol-style": "native"},
                data=b"Hello world!\r\n",
            )
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Remove previously added directory and file",
        [
            CommitChange(
                change_type=CommitChangeType.Delete,
                path="directory/",
            )
        ],
    )
    # third commit
    add_commit(
        repo_url,
        (
            "Add again same directory containing same file with CRLF end of line "
            "but do not set svn:eol-style property value so CRLF will not be "
            "replaced by LF when exporting the revision"
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="directory/file_with_crlf_eol.txt",
                data=b"Hello world!\r\n",
            )
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # 2 contents: the LF and the CRLF variant of the same file
    assert get_stats(loader.storage) == {
        "content": 2,
        "directory": 5,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
def get_head_revision_paths_info(loader: SvnLoader) -> Dict[bytes, Dict[str, Any]]:
    """Return a mapping from path name to its directory entry, for the whole
    filesystem tree of the revision targeted by the loader snapshot's HEAD
    branch."""
    assert loader.snapshot is not None
    head_revision_id = loader.snapshot.branches[b"HEAD"].target
    head_revision = loader.storage.revision_get([head_revision_id])[0]
    assert head_revision is not None
    return {
        entry["name"]: entry
        for entry in loader.storage.directory_ls(
            head_revision.directory, recursive=True
        )
    }
def test_loader_eol_style_on_svn_link_handling(swh_storage, repo_url, tmp_path):
    """svn:eol-style set on a svn:special link must not alter the link target:
    only the regular file gets its CRLF turned into LF on export.
    """
    # first commit
    add_commit(
        repo_url,
        (
            "Add a regular file, a directory and a link to the regular file "
            "in the directory. Set svn:eol-style property for the regular "
            "file and the link. Set svn:special property for the link."
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="file_with_crlf_eol.txt",
                properties={"svn:eol-style": "native"},
                data=b"Hello world!\r\n",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="directory/file_with_crlf_eol.txt",
                properties={"svn:eol-style": "native", "svn:special": "*"},
                data=b"link ../file_with_crlf_eol.txt",
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # check loaded objects are those expected
    assert get_stats(loader.storage) == {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 1,
        "skipped_content": 0,
        "snapshot": 1,
    }
    paths = get_head_revision_paths_info(loader)
    # CRLF was normalized to LF for the regular file
    assert (
        loader.storage.content_get_data(paths[b"file_with_crlf_eol.txt"]["sha1"])
        == b"Hello world!\n"
    )
    # the svn:special file was ingested as a symlink whose target is untouched
    assert paths[b"directory/file_with_crlf_eol.txt"]["perms"] == DentryPerms.symlink
    assert (
        loader.storage.content_get_data(
            paths[b"directory/file_with_crlf_eol.txt"]["sha1"]
        )
        == b"../file_with_crlf_eol.txt"
    )
def test_loader_svn_special_property_unset(swh_storage, repo_url, tmp_path):
    """Unsetting svn:special on previously created links must turn them back
    into regular files whose content is the raw ``link ...`` text."""
    # first commit
    add_commit(
        repo_url,
        (
            "Create a regular file, a link to a file and a link to an "
            "external file. Set the svn:special property on the links."
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="file.txt",
                data=b"Hello world!\n",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="link.txt",
                properties={"svn:special": "*"},
                data=b"link ./file.txt",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="external_link.txt",
                properties={"svn:special": "*"},
                data=b"link /home/user/data.txt",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Unset the svn:special property on the links.",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="link.txt",
                properties={"svn:special": None},
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="external_link.txt",
                properties={"svn:special": None},
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # check loaded objects are those expected
    assert get_stats(loader.storage) == {
        "content": 5,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 2,
        "skipped_content": 0,
        "snapshot": 1,
    }
    paths = get_head_revision_paths_info(loader)
    # former links are now regular files holding the literal link text
    assert paths[b"link.txt"]["perms"] == DentryPerms.content
    assert (
        loader.storage.content_get_data(paths[b"link.txt"]["sha1"])
        == b"link ./file.txt"
    )
    assert paths[b"external_link.txt"]["perms"] == DentryPerms.content
    assert (
        loader.storage.content_get_data(paths[b"external_link.txt"]["sha1"])
        == b"link /home/user/data.txt"
    )
def test_loader_invalid_svn_eol_style_property_value(swh_storage, repo_url, tmp_path):
    """An invalid svn:eol-style value must be ignored: the file content is
    stored with its end of lines exactly as committed."""
    filename = "file_with_crlf_eol.txt"
    file_content = b"Hello world!\r\n"
    # first commit
    add_commit(
        repo_url,
        (
            "Add a file with CRLF end of line and set svn:eol-style property "
            "to an invalid value."
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path=filename,
                properties={"svn:eol-style": "foo"},
                data=file_content,
            )
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    paths = get_head_revision_paths_info(loader)
    # end of lines should not have been processed
    assert (
        loader.storage.content_get_data(paths[filename.encode()]["sha1"])
        == file_content
    )
def test_loader_first_revision_is_not_number_one(
    swh_storage, mocker, repo_url, tmp_path
):
    """When the first revision yielded by the repository is not r1, loading
    must fail but still record a partial visit holding a snapshot."""

    class SvnRepoSkipFirstRevision(SvnRepo):
        def logs(self, revision_start, revision_end):
            """Overrides logs method to skip revision number one in yielded revisions"""
            yield from super().logs(revision_start + 1, revision_end)

    from swh.loader.svn import loader

    mocker.patch.object(loader, "SvnRepo", SvnRepoSkipFirstRevision)
    # three commits, each adding one file with a distinct content; the filename
    # is interpolated into both the message and the data (the previous
    # f-strings had no placeholder, which made all contents identical and
    # contradicted the expected "content": 2 stats below)
    for filename in ("foo", "bar", "baz"):
        add_commit(
            repo_url,
            f"Add {filename} file",
            [
                CommitChange(
                    change_type=CommitChangeType.AddOrUpdate,
                    path=filename,
                    data=f"{filename}\n".encode(),
                )
            ],
        )
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    # post loading will detect an issue and make a partial visit with a snapshot
    assert loader.load() == {"status": "failed"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="partial",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # r1 was skipped, so only two revisions (and two contents) were loaded
    assert get_stats(loader.storage) == {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 2,
        "skipped_content": 0,
        "snapshot": 1,
    }
def test_loader_svn_special_property_on_binary_file(swh_storage, repo_url, tmp_path):
    """When a file has the svn:special property set but is not a svn link,
    it might be truncated under certain conditions when performing an export
    operation."""
    # binary payload that is not a valid "link ..." target (cygwin symlink blob)
    data = (
        b"!<symlink>\xff\xfea\x00p\x00t\x00-\x00c\x00y\x00g\x00.\x00s\x00h\x00\x00\x00"
    )
    # first commit
    add_commit(
        repo_url,
        (
            "Add a non svn link binary file and set the svn:special property on it."
            "That file will be truncated when exporting it."
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="binary_file",
                properties={"svn:special": "*"},
                data=data,
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        (
            "Add a non svn link binary file and set the svn:special and "
            "svn:mime-type properties on it."
            "That file will not be truncated when exporting it."
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="another_binary_file",
                properties={
                    "svn:special": "*",
                    "svn:mime-type": "application/octet-stream",
                },
                data=data,
            ),
        ],
    )
    # third commit
    add_commit(
        repo_url,
        "Remove the svn:special property on the previously added files",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="binary_file",
                properties={"svn:special": None},
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="another_binary_file",
                properties={"svn:special": None},
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_loader_last_revision_divergence(swh_storage, datadir, tmp_path):
    """A revision divergence detected during loading must end the visit in
    partial status while still recording the snapshot built so far."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    class SvnLoaderRevisionDivergence(SvnLoader):
        # force the divergence check to fail for every processed revision
        def _check_revision_divergence(self, count, rev, dir_id):
            raise ValueError("revision divergence detected")
    loader = SvnLoaderRevisionDivergence(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "failed"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="partial",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
def test_loader_delete_directory_while_file_has_same_prefix(
    swh_storage, repo_url, tmp_path
):
    """Deleting a directory ("foo") must not affect a sibling file whose name
    shares the same prefix ("foo.c")."""
    # first commit
    add_commit(
        repo_url,
        "Add a file and a directory with same prefix",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="foo/bar.c",
                data=b'#include "../foo.c"',
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="foo.c",
                data=b"int foo() {return 0;}",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Delete previously added directory and update file content",
        [
            CommitChange(change_type=CommitChangeType.Delete, path="foo"),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="foo.c",
                data=b"int foo() {return 1;}",
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_svn_loader_incremental(swh_storage, repo_url, tmp_path):
    """Perform three successive incremental loads of the same origin, with
    svn:eol-style being set, kept, then unset on a file in between: each
    visit must be eventful and produce a consistent snapshot."""
    # first commit
    add_commit(
        repo_url,
        (
            "Add a directory containing a file with CRLF end of line "
            "and set svn:eol-style property to native so CRLF will be "
            "replaced by LF in the file when exporting the revision"
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="file_with_crlf_eol.txt",
                properties={"svn:eol-style": "native"},
                data=b"Hello world!\r\n",
            )
        ],
    )
    # first load
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # second commit
    add_commit(
        repo_url,
        "Modify previously added file",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="file_with_crlf_eol.txt",
                data=b"Hello World!\r\n",
            )
        ],
    )
    # second load, incremental
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # third commit
    add_commit(
        repo_url,
        "Unset svn:eol-style property on file",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="file_with_crlf_eol.txt",
                properties={"svn:eol-style": None},
            )
        ],
    )
    # third load, incremental
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_svn_loader_incremental_replay_start_with_empty_directory(
    swh_storage, mocker, repo_url, tmp_path
):
    """An incremental load must start replaying revisions in an empty working
    directory (no leftover files from the previous visit)."""
    # first commit
    add_commit(
        repo_url,
        ("Add a file"),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="foo.txt",
                data=b"foo\n",
            )
        ],
    )
    # first load
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # second commit
    add_commit(
        repo_url,
        "Modify previously added file",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="foo.txt",
                data=b"bar\n",
            )
        ],
    )
    class SvnRepoCheckReplayStartWithEmptyDirectory(SvnRepo):
        def swh_hash_data_per_revision(self, start_revision: int, end_revision: int):
            """Overrides swh_hash_data_per_revision method to grab the content
            of the directory where the svn revisions will be replayed before that
            process starts."""
            self.replay_dir_content_before_start = [
                os.path.join(root, name)
                for root, _, files in os.walk(self.local_url)
                for name in files
            ]
            yield from super().swh_hash_data_per_revision(start_revision, end_revision)
    from swh.loader.svn import loader
    mocker.patch.object(loader, "SvnRepo", SvnRepoCheckReplayStartWithEmptyDirectory)
    # second load, incremental
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path)
    loader.load()
    # check work directory was empty before replaying revisions
    assert loader.svnrepo.replay_dir_content_before_start == []
def test_loader_svn_executable_property_on_svn_link_handling(
    swh_storage, repo_url, tmp_path
):
    """svn:executable set on a svn link must carry over when the link is later
    turned into a regular file."""
    # first commit
    add_commit(
        repo_url,
        (
            "Add an executable file and a svn link to it."
            "Set svn:executable property for both paths."
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello-world",
                properties={"svn:executable": "*"},
                data=b"#!/bin/bash\necho Hello World !",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:executable": "*", "svn:special": "*"},
                data=b"link hello-world",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        (
            "Remove executable file, unset link and replace it with executable content."
            "As the link was previously marked as executable, execution rights should"
            "be set after turning it to a regular file."
        ),
        [
            CommitChange(change_type=CommitChangeType.Delete, path="hello-world"),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:special": None},
                data=b"#!/bin/bash\necho Hello World !",
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_loader_svn_add_property_on_link(swh_storage, repo_url, tmp_path):
    """Adding a svn:eol-style property on an existing svn link must not break
    the loading process nor corrupt the reconstructed filesystem."""
    # first commit
    add_commit(
        repo_url,
        "Add an executable file and a svn link to it.",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello-world",
                properties={"svn:executable": "*"},
                data=b"#!/bin/bash\necho Hello World !",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:special": "*"},
                data=b"link hello-world",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Set svn:eol-style property on link",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:eol-style": "native"},
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_loader_svn_link_parsing(swh_storage, repo_url, tmp_path):
    """A svn link whose content ends with CRLF must still be parsed correctly
    when updated."""
    # first commit
    add_commit(
        repo_url,
        "Add an executable file and a svn link to it.",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello-world",
                properties={"svn:executable": "*"},
                data=b"#!/bin/bash\necho Hello World !",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:special": "*"},
                data=b"link hello-world",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Update svn link content",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                data=b"link hello-world\r\n",
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_loader_svn_empty_local_dir_before_post_load(swh_storage, datadir, tmp_path):
    """The loader's local working directory must already be cleaned up when
    post_load runs after a successful load."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    class SvnLoaderPostLoadLocalDirIsEmpty(SvnLoader):
        # capture the working directory content at post_load time
        def post_load(self, success=True):
            if success:
                self.local_dirname_content = [
                    os.path.join(root, name)
                    for root, _, files in os.walk(self.svnrepo.local_dirname)
                    for name in files
                ]
            return super().post_load(success)
    loader = SvnLoaderPostLoadLocalDirIsEmpty(
        swh_storage, repo_url, temp_directory=tmp_path
    )
    assert loader.load() == {"status": "eventful"}
    assert loader.local_dirname_content == []
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(GOURMET_SNAPSHOT, loader.storage)
def _dump_project(tmp_path, origin_url):
    """Dump the subversion repository at ``origin_url`` with svnrdump, gzip
    the dump file in ``tmp_path`` and return the path of the gzipped file.

    Raises:
        subprocess.CalledProcessError: if svnrdump or gzip fails (previously
            failures were silently ignored, producing an empty/partial dump).
    """
    svnrdump_cmd = ["svnrdump", "dump", origin_url]
    dump_path = f"{tmp_path}/repo.dump"
    with open(dump_path, "wb") as dump_file:
        subprocess.run(svnrdump_cmd, stdout=dump_file, check=True)
    subprocess.run(["gzip", dump_path], check=True)
    return dump_path + ".gz"
def test_loader_svn_add_property_on_directory_link(swh_storage, repo_url, tmp_path):
    """Adding a svn:eol-style property on a svn link targeting a directory
    must not break the loading process."""
    # first commit
    add_commit(
        repo_url,
        "Add an executable file in a directory and a svn link to the directory.",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="code/hello-world",
                properties={"svn:executable": "*"},
                data=b"#!/bin/bash\necho Hello World !",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:special": "*"},
                data=b"link code",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Set svn:eol-style property on link",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="hello",
                properties={"svn:eol-style": "native"},
            ),
        ],
    )
    # instantiate a svn loader checking after each processed revision that
    # the repository filesystem it reconstructed does not differ from a subversion
    # export of that revision
    loader = SvnLoader(swh_storage, repo_url, temp_directory=tmp_path, check_revision=1)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
@pytest.mark.parametrize(
    "svn_loader_cls", [SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump]
)
def test_loader_with_subprojects(swh_storage, repo_url, tmp_path, svn_loader_cls):
    """Load each subproject of a multi-project repository as its own origin:
    a first load is eventful, a second incremental load is uneventful, and the
    per-origin object counts match."""
    # first commit
    add_commit(
        repo_url,
        "Add first project in repository",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="project1/foo.sh",
                data=b"#!/bin/bash\necho foo",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Add second project in repository",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="project2/bar.sh",
                data=b"#!/bin/bash\necho bar",
            ),
        ],
    )
    # third commit
    add_commit(
        repo_url,
        "Add third project in repository",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="project3/baz.sh",
                data=b"#!/bin/bash\necho baz",
            ),
        ],
    )
    for i in range(1, 4):
        # load each project in the repository separately and check behavior
        # is the same if origin URL has a trailing slash or not
        origin_url = f"{repo_url}/project{i}{'/' if i%2 else ''}"
        loader_params = {
            "storage": swh_storage,
            "url": origin_url,
            "origin_url": origin_url,
            "temp_directory": tmp_path,
            "incremental": True,
            "check_revision": 1,
        }
        if svn_loader_cls == SvnLoaderFromDumpArchive:
            loader_params["archive_path"] = _dump_project(tmp_path, origin_url)
        loader = svn_loader_cls(**loader_params)
        assert loader.load() == {"status": "eventful"}
        assert_last_visit_matches(
            loader.storage,
            origin_url,
            status="full",
            type="svn",
        )
        check_snapshot(loader.snapshot, loader.storage)
        # a fresh dump is needed for the second load from a dump archive
        if svn_loader_cls == SvnLoaderFromDumpArchive:
            loader_params["archive_path"] = _dump_project(tmp_path, origin_url)
        loader = svn_loader_cls(**loader_params)
        assert loader.load() == {"status": "uneventful"}
        # each project origin must have the expected loaded object counts
        assert get_stats(loader.storage) == {
            "content": i,  # one content
            "directory": 2 * i,  # two directories
            "origin": i,
            "origin_visit": 2 * i,  # two visits
            "release": 0,
            "revision": i,  # one revision
            "skipped_content": 0,
            "snapshot": i,  # one snapshot
        }
@pytest.mark.parametrize(
    "svn_loader_cls", [SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump]
)
def test_loader_subproject_root_dir_removal(
    swh_storage, repo_url, tmp_path, svn_loader_cls
):
    """Loading a subproject whose root directory was removed and re-added in
    the repository history must succeed."""
    # first commit
    add_commit(
        repo_url,
        "Add project in repository",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="project/foo.sh",
                data=b"#!/bin/bash\necho foo",
            ),
        ],
    )
    # second commit
    add_commit(
        repo_url,
        "Remove project root directory",
        [CommitChange(change_type=CommitChangeType.Delete, path="project/")],
    )
    # third commit
    add_commit(
        repo_url,
        "Re-add project in repository",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="project/foo.sh",
                data=b"#!/bin/bash\necho foo",
            ),
        ],
    )
    origin_url = f"{repo_url}/project"
    loader_params = {
        "storage": swh_storage,
        "url": origin_url,
        "origin_url": origin_url,
        "temp_directory": tmp_path,
        "incremental": True,
        "check_revision": 1,
    }
    if svn_loader_cls == SvnLoaderFromDumpArchive:
        loader_params["archive_path"] = _dump_project(tmp_path, origin_url)
    loader = svn_loader_cls(**loader_params)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        origin_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
    # a fresh dump is needed for the second load from a dump archive
    if svn_loader_cls == SvnLoaderFromDumpArchive:
        loader_params["archive_path"] = _dump_project(tmp_path, origin_url)
    loader = svn_loader_cls(**loader_params)
    # nothing new to load on the second, incremental, visit
    assert loader.load() == {"status": "uneventful"}
@pytest.mark.parametrize("svn_loader_cls", [SvnLoader, SvnLoaderFromRemoteDump])
def test_loader_svn_not_found_after_successful_visit(
    swh_storage, datadir, tmp_path, svn_loader_cls
):
    """A repository removed after a successful visit must yield an uneventful
    load with a not_found visit status on the next visit."""
    archive_name = "pkg-gourmet"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    loader = svn_loader_cls(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
        snapshot=GOURMET_SNAPSHOT.id,
    )
    check_snapshot(loader.snapshot, loader.storage)
    # simulate removal of remote repository
    shutil.rmtree(repo_url.replace("file://", ""))
    loader = svn_loader_cls(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "uneventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="not_found",
        type="svn",
        snapshot=None,
    )
def test_loader_svn_from_remote_dump_url_redirect(swh_storage, tmp_path, mocker):
    """When the origin URL redirects (http -> https here), the remote dump
    loader must dump the repository from the redirection URL."""
    repo_url = "http://svn.example.org/repo"
    repo_redirect_url = "https://svn.example.org/repo"
    # mock remote subversion operations
    from swh.loader.svn.svn import client
    mocker.patch("swh.loader.svn.svn.RemoteAccess")
    init_svn_repo_from_dump = mocker.patch(
        "swh.loader.svn.loader.init_svn_repo_from_dump"
    )
    init_svn_repo_from_dump.return_value = ("", "")
    mock_client = mocker.MagicMock()
    mocker.patch.object(client, "Client", mock_client)
    class Info:
        # svn info reports the post-redirect repository URLs
        repos_root_url = repo_redirect_url
        url = repo_redirect_url
    mock_client().info.return_value = {"repo": Info()}
    # init remote dump loader and mock some methods
    loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=tmp_path)
    loader.dump_svn_revisions = mocker.MagicMock(return_value=("", -1))
    loader.start_from = mocker.MagicMock(return_value=(0, 0))
    # prepare loading
    loader.prepare()
    # check redirection URL has been used to dump repository
    assert loader.dump_svn_revisions.call_args_list[0][0][0] == repo_redirect_url
@pytest.mark.parametrize("svn_loader_cls", [SvnLoader, SvnLoaderFromRemoteDump])
def test_loader_basic_authentication_required(
    swh_storage, repo_url, tmp_path, svn_loader_cls, svnserve
):
    """Loading a svnserve-hosted repository requiring authentication must fail
    without credentials and succeed when they are embedded in the URL."""
    # add file to empty test repo
    add_commit(
        repo_url,
        "Add project in repository",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="project/foo.sh",
                data=b"#!/bin/bash\necho foo",
            ),
        ],
    )
    # compute repo URLs that will be made available by svnserve
    repo_path = repo_url.replace("file://", "")
    repo_root = os.path.dirname(repo_path)
    repo_name = os.path.basename(repo_path)
    username = "anonymous"
    password = "anonymous"
    # NOTE(review): fixed port; assumes the svnserve fixture binds it and it
    # is free on the test host
    port = 12000
    repo_url_no_auth = f"svn://localhost:{port}/{repo_name}"
    repo_url = f"svn://{username}:{password}@localhost:{port}/{repo_name}"
    # disable anonymous access and require authentication on test repo
    with open(os.path.join(repo_path, "conf", "svnserve.conf"), "w") as svnserve_conf:
        svnserve_conf.write(
            textwrap.dedent(
                """
                [general]
                # Authentication realm of the repository.
                realm = test-repository
                password-db = passwd
                # Deny all anonymous access
                anon-access = none
                # Grant authenticated users read and write privileges
                auth-access = write
                """
            )
        )
    # add a user with read/write access on test repo
    with open(os.path.join(repo_path, "conf", "passwd"), "w") as passwd:
        passwd.write(f"[users]\n{username} = {password}")
    # execute svnserve
    svnserve(repo_root, port)
    # check loading failed with no authentication
    loader = svn_loader_cls(swh_storage, repo_url_no_auth, temp_directory=tmp_path)
    assert loader.load() == {"status": "uneventful"}
    # check loading succeeded with authentication
    loader = svn_loader_cls(swh_storage, repo_url, temp_directory=tmp_path)
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)
def test_loader_with_spaces_in_svn_url(swh_storage, repo_url, tmp_path):
    """A file whose path contains spaces must be exportable through SvnRepo
    using its plain (unescaped) filename in the URL."""
    filename = "file with spaces.txt"
    content = b"foo"
    add_commit(
        repo_url,
        "Add file with spaces in its name",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path=filename,
                data=content,
            ),
        ],
    )
    svnrepo = SvnRepo(repo_url, repo_url, tmp_path, max_content_length=10000)
    dest_path = f"{tmp_path}/file"
    # export the committed file: the URL references the filename with spaces
    # (the previous "(unknown)" placeholder was paste-mangled and could never
    # match the committed path)
    svnrepo.export(f"{repo_url}/{filename}", to=dest_path)
    with open(dest_path, "rb") as f:
        assert f.read() == content
@pytest.mark.parametrize("svn_loader_cls", [SvnLoader, SvnLoaderFromRemoteDump])
def test_loader_repo_with_copyfrom_and_replace_operations(
    swh_storage, repo_url, tmp_path, svn_loader_cls
):
    """Load a repository whose history uses copyfrom operations and replaces a
    copied directory by a file of the same name: reconstruction must match the
    subversion export at each revision."""
    add_commit(
        repo_url,
        "Create trunk/data folder",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/data/foo",
                data=b"foo",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/data/bar",
                data=b"bar",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/data/baz/",
            ),
        ],
    )
    add_commit(
        repo_url,
        "Create trunk/project folder",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/project/",
            ),
        ],
    )
    add_commit(
        repo_url,
        "Create trunk/project/bar as copy of trunk/data/bar from revision 1",
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/project/bar",
                copyfrom_path=repo_url + "/trunk/data/bar",
                copyfrom_rev=1,
            ),
        ],
    )
    add_commit(
        repo_url,
        (
            "Create trunk/project/data/ folder as a copy of /trunk/data from revision 1"
            " and replace the trunk/project/data/baz/ folder by a trunk/project/data/baz file"
        ),
        [
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/project/data/",
                copyfrom_path=repo_url + "/trunk/data/",
                copyfrom_rev=1,
            ),
            CommitChange(
                change_type=CommitChangeType.Delete,
                path="trunk/project/data/baz/",
            ),
            CommitChange(
                change_type=CommitChangeType.AddOrUpdate,
                path="trunk/project/data/baz",
                data=b"baz",
            ),
        ],
    )
    # check_revision=1: compare the reconstructed filesystem with a svn export
    # after each processed revision
    loader = svn_loader_cls(
        swh_storage, repo_url, temp_directory=tmp_path, check_revision=1
    )
    assert loader.load() == {"status": "eventful"}
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="svn",
    )
    check_snapshot(loader.snapshot, loader.storage)

File Metadata

Mime Type
text/x-python
Expires
Fri, Jul 4, 11:13 AM (3 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3295217

Event Timeline