diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,5 @@ include Makefile include requirements*.txt include version.txt -recursive-include swh/loader/git/tests/data *.xz -recursive-include swh/loader/git/tests/resources/ * +recursive-include swh/loader/git/tests/data * recursive-include swh py.typed diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,4 @@ pytest pytest-mock swh.scheduler[testing] +swh.storage[testing] diff --git a/swh/loader/git/tests/__init__.py b/swh/loader/git/tests/__init__.py --- a/swh/loader/git/tests/__init__.py +++ b/swh/loader/git/tests/__init__.py @@ -1,27 +1,23 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -TEST_LOADER_CONFIG = { - "storage": { - "cls": "pipeline", - "steps": [ - {"cls": "filter"}, - { - "cls": "buffer", - "min_batch_size": { - "content": 10, - "content_bytes": 100 * 1024 * 1024, - "directory": 10, - "revision": 10, - "release": 10, - }, - }, - {"cls": "memory"}, - ], - }, - "max_content_size": 100 * 1024 * 1024, - "pack_size_bytes": 4 * 1024 * 1024 * 1024, - "save_data": False, -} +import os +import subprocess + +from typing import Optional + + +def prepare_repository_from_archive( + archive_path: str, + filename: Optional[str] = None, + tmp_path: str = "/tmp", + uncompress_archive: bool = True, +) -> str: + if uncompress_archive: + # uncompress folder/repositories/dump for the loader to ingest + subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path]) + # build the origin url (or some derivative form) + _fname = filename if filename else os.path.basename(archive_path) + return f"file://{tmp_path}/{_fname}" diff --git a/swh/loader/git/tests/conftest.py b/swh/loader/git/tests/conftest.py --- a/swh/loader/git/tests/conftest.py +++ b/swh/loader/git/tests/conftest.py @@ -1,11 +1,53 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os +import yaml + import pytest +from typing import Any, Dict + from swh.scheduler.tests.conftest import * # noqa +from swh.storage.tests.conftest import * # noqa + + +@pytest.fixture +def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: + swh_storage_backend_config["journal_writer"] = {} + return { + "storage": { + "cls": "pipeline", + "steps": [ + {"cls": "filter"}, + { + "cls": "buffer", + "min_batch_size": { + "content": 10, + "content_bytes": 100 * 1024 * 1024, + "directory": 10, + "revision": 10, + "release": 10, + }, + }, + swh_storage_backend_config, + ], + }, + "max_content_size": 100 * 1024 * 1024, + "pack_size_bytes": 4 * 1024 * 1024 * 1024, + "save_data": False, + } + + +@pytest.fixture +def swh_config(swh_loader_config, monkeypatch, tmp_path): + conffile = os.path.join(str(tmp_path), "loader.yml") + with open(conffile, "w") as f: + f.write(yaml.dump(swh_loader_config)) + monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) + return conffile @pytest.fixture(scope="session") # type: ignore # expected redefinition diff --git a/swh/loader/git/tests/resources/testrepo.tgz b/swh/loader/git/tests/data/testrepo.tgz rename from swh/loader/git/tests/resources/testrepo.tgz rename to swh/loader/git/tests/data/testrepo.tgz diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -3,38 +3,24 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import datetime import os.path import dulwich.repo +import pytest + +from unittest import TestCase from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.hashutil import hash_to_bytes - -from swh.loader.core.tests import BaseLoaderTest from swh.loader.tests.common import assert_last_visit_matches +from swh.loader.git.from_disk import GitLoaderFromDisk +from swh.loader.git.from_disk import GitLoaderFromArchive +from swh.loader.package.tests.common import check_snapshot, get_stats -from swh.loader.git.from_disk import GitLoaderFromDisk as OrigGitLoaderFromDisk -from swh.loader.git.from_disk import GitLoaderFromArchive as OrigGitLoaderFromArchive - -from . import TEST_LOADER_CONFIG - - -class GitLoaderFromArchive(OrigGitLoaderFromArchive): - def project_name_from_archive(self, archive_path): - # We don't want the project name to be 'resources'. - return "testrepo" +from swh.loader.git.tests import prepare_repository_from_archive - def parse_config_file(self, *args, **kwargs): - return TEST_LOADER_CONFIG - - -CONTENT1 = { - "33ab5639bfd8e7b95eb1d8d0b87781d4ffea4d5d", # README v1 - "349c4ff7d21f1ec0eda26f3d9284c293e3425417", # README v2 - "799c11e348d39f1704022b8354502e2f81f3c037", # file1.txt - "4bdb40dfd6ec75cb730e678b5d7786e30170c5fb", # file2.txt -} SNAPSHOT_ID = "a23699280a82a043f8c0994cf1631b568f716f95" @@ -105,119 +91,77 @@ } -class BaseGitLoaderFromDiskTest(BaseLoaderTest): - def setUp(self, archive_name, uncompress_archive, filename="testrepo"): - super().setUp( - archive_name=archive_name, - filename=filename, - prefix_tmp_folder_name="swh.loader.git.", - start_path=os.path.dirname(__file__), - uncompress_archive=uncompress_archive, - ) - - -class GitLoaderFromDiskTest(OrigGitLoaderFromDisk): - def parse_config_file(self, *args, **kwargs): - return TEST_LOADER_CONFIG - - -class BaseDirGitLoaderFromDiskTest(BaseGitLoaderFromDiskTest): - """Mixin base loader test to prepare the git - repository to uncompress, load and test the results. - - This sets up - - """ - - def setUp(self): - super().setUp("testrepo.tgz", uncompress_archive=True) - self.loader = GitLoaderFromDiskTest( - url=self.repo_url, - visit_date=datetime.datetime( - 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc - ), - directory=self.destination_path, - ) - self.storage = self.loader.storage - self.repo = dulwich.repo.Repo(self.destination_path) - - def load(self): - return self.loader.load() - - -class BaseGitLoaderFromArchiveTest(BaseGitLoaderFromDiskTest): - """Mixin base loader test to prepare the git - repository to uncompress, load and test the results. - - This sets up - - """ - - def setUp(self): - super().setUp("testrepo.tgz", uncompress_archive=False) - self.loader = GitLoaderFromArchive( - url=self.repo_url, - visit_date=datetime.datetime( - 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc - ), - archive_path=self.destination_path, - ) - self.storage = self.loader.storage - - def load(self): - return self.loader.load() - - -class GitLoaderFromDiskTests: +class CommonGitLoaderTests: """Common tests for all git loaders.""" def test_load(self): """Loads a simple repository (made available by `setUp()`), and checks everything was added in the storage.""" - res = self.load() - self.assertEqual(res["status"], "eventful", res) - - self.assertContentsContain(CONTENT1) - self.assertCountDirectories(7) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7) - self.assertCountSnapshots(1) - - self.assertRevisionsContain(REVISIONS1) + res = self.loader.load() - self.assertSnapshotEqual(SNAPSHOT1) - - self.assertEqual(self.loader.load_status(), {"status": "eventful"}) - self.assertEqual(self.loader.visit_status(), "full") + assert res == {"status": "eventful"} assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", snapshot=hash_to_bytes(SNAPSHOT1["id"]), ) + stats = get_stats(self.loader.storage) + assert stats == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + + check_snapshot(SNAPSHOT1, self.loader.storage) + def test_load_unchanged(self): """Checks loading a repository a second time does not add any extra data.""" - res = self.load() - self.assertEqual(res["status"], "eventful") + res = self.loader.load() + assert res == {"status": "eventful"} assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", snapshot=hash_to_bytes(SNAPSHOT1["id"]), ) - res = self.load() - self.assertEqual(res["status"], "uneventful") - self.assertCountSnapshots(1) + stats0 = get_stats(self.loader.storage) + assert stats0 == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + + res = self.loader.load() + assert res == {"status": "uneventful"} + stats1 = get_stats(self.loader.storage) + expected_stats = copy.deepcopy(stats0) + expected_stats["origin_visit"] += 1 + assert stats1 == expected_stats + + check_snapshot(SNAPSHOT1, self.loader.storage) assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", @@ -225,17 +169,32 @@ ) -class DirGitLoaderTest(BaseDirGitLoaderFromDiskTest, GitLoaderFromDiskTests): - """Tests for the GitLoaderFromDisk. Includes the common ones, and - add others that only work with a local dir.""" +class FullGitLoaderTests(CommonGitLoaderTests): + """Tests for GitLoader (from disk or not). Includes the common ones, and + add others that only work with a local dir. + + """ def test_load_changed(self): """Loads a repository, makes some changes by adding files, commits, and merges, load it again, and check the storage contains everything it should.""" # Initial load - res = self.load() - self.assertEqual(res["status"], "eventful", res) + res = self.loader.load() + assert res == {"status": "eventful"} + + stats0 = get_stats(self.loader.storage) + assert stats0 == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } # Load with a new file + revision with open(os.path.join(self.destination_path, "hello.py"), "a") as fd: @@ -251,27 +210,30 @@ assert new_revision not in revisions revisions[new_revision] = new_dir - res = self.load() - self.assertEqual(res["status"], "eventful") + res = self.loader.load() + assert res == {"status": "eventful"} - self.assertCountContents(4 + 1) - self.assertCountDirectories(7 + 1) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7 + 1) - self.assertCountSnapshots(1 + 1) + stats1 = get_stats(self.loader.storage) + expected_stats = copy.deepcopy(stats0) + # did one new visit + expected_stats["origin_visit"] += 1 + # with one more of the following objects + expected_stats["person"] += 1 + expected_stats["content"] += 1 + expected_stats["directory"] += 1 + expected_stats["revision"] += 1 + # concluding into 1 new snapshot + expected_stats["snapshot"] += 1 - self.assertRevisionsContain(revisions) - - self.assertEqual(self.loader.load_status(), {"status": "eventful"}) - self.assertEqual(self.loader.visit_status(), "full") + assert stats1 == expected_stats visit_status = assert_last_visit_matches( - self.storage, self.repo_url, status="full", type="git" + self.loader.storage, self.repo_url, status="full", type="git" ) - self.assertIsNotNone(visit_status.snapshot) + assert visit_status.snapshot is not None snapshot_id = visit_status.snapshot - snapshot = self.storage.snapshot_get(snapshot_id) + snapshot = self.loader.storage.snapshot_get(snapshot_id) branches = snapshot["branches"] assert branches[b"HEAD"] == { "target": b"refs/heads/master", @@ -304,29 +266,30 @@ assert merge_commit.decode() not in revisions revisions[merge_commit.decode()] = merged_tree.id.decode() - res = self.load() - self.assertEqual(res["status"], "eventful") - - self.assertCountContents(4 + 1) - self.assertCountDirectories(7 + 2) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7 + 2) - self.assertCountSnapshots(1 + 1 + 1) + res = self.loader.load() + assert res == {"status": "eventful"} - self.assertRevisionsContain(revisions) + stats2 = get_stats(self.loader.storage) + expected_stats = copy.deepcopy(stats1) + # one more visit + expected_stats["origin_visit"] += 1 + # with 1 new directory and revision + expected_stats["directory"] += 1 + expected_stats["revision"] += 1 + # concluding into 1 new snapshot + expected_stats["snapshot"] += 1 - self.assertEqual(self.loader.load_status(), {"status": "eventful"}) - self.assertEqual(self.loader.visit_status(), "full") + assert stats2 == expected_stats visit_status = assert_last_visit_matches( - self.storage, self.repo_url, status="full", type="git" + self.loader.storage, self.repo_url, status="full", type="git" ) - self.assertIsNotNone(visit_status.snapshot) + assert visit_status.snapshot is not None merge_snapshot_id = visit_status.snapshot assert merge_snapshot_id != snapshot_id - merge_snapshot = self.storage.snapshot_get(merge_snapshot_id) + merge_snapshot = self.loader.storage.snapshot_get(merge_snapshot_id) merge_branches = merge_snapshot["branches"] assert merge_branches[b"HEAD"] == { "target": b"refs/heads/master", @@ -372,14 +335,11 @@ expected_snapshot = Snapshot(branches=branches) # Load the modified repository - res = self.load() - assert res["status"] == "eventful" - - assert self.loader.load_status() == {"status": "eventful"} - assert self.loader.visit_status() == "full" + res = self.loader.load() + assert res == {"status": "eventful"} assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", @@ -390,22 +350,16 @@ with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f: f.write(b"ref: refs/heads/dangling-branch\n") - res = self.load() - self.assertEqual(res["status"], "eventful", res) - - self.assertContentsContain(CONTENT1) - self.assertCountDirectories(7) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7) - self.assertCountSnapshots(1) + res = self.loader.load() + assert res == {"status": "eventful"} visit_status = assert_last_visit_matches( - self.storage, self.repo_url, status="full", type="git" + self.loader.storage, self.repo_url, status="full", type="git" ) snapshot_id = visit_status.snapshot assert snapshot_id is not None - snapshot = self.storage.snapshot_get(snapshot_id) + snapshot = self.loader.storage.snapshot_get(snapshot_id) branches = snapshot["branches"] assert branches[b"HEAD"] == { @@ -414,9 +368,57 @@ } assert branches[b"refs/heads/dangling-branch"] is None + stats = get_stats(self.loader.storage) + assert stats == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + -class GitLoaderFromArchiveTest(BaseGitLoaderFromArchiveTest, GitLoaderFromDiskTests): - """Tests for GitLoaderFromArchive. Imports the common ones - from GitLoaderTests.""" +class GitLoaderFromDiskTest(TestCase, FullGitLoaderTests): + """Prepare a git directory repository to be loaded through a GitLoaderFromDisk. + This tests all git loader scenario. - pass + """ + + @pytest.fixture(autouse=True) + def init(self, swh_config, datadir, tmp_path): + archive_name = "testrepo" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + tmp_path = str(tmp_path) + self.repo_url = prepare_repository_from_archive( + archive_path, archive_name, tmp_path=tmp_path + ) + self.destination_path = os.path.join(tmp_path, archive_name) + self.loader = GitLoaderFromDisk( + url=self.repo_url, + visit_date=datetime.datetime( + 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc + ), + directory=self.destination_path, + ) + self.repo = dulwich.repo.Repo(self.destination_path) + + +class GitLoaderFromArchiveTest(TestCase, CommonGitLoaderTests): + """Tests for GitLoaderFromArchive. Only tests common scenario.""" + + @pytest.fixture(autouse=True) + def init(self, swh_config, datadir, tmp_path): + archive_name = "testrepo" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + self.repo_url = archive_path + self.loader = GitLoaderFromArchive( + url=self.repo_url, + archive_path=archive_path, + visit_date=datetime.datetime( + 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc + ), + ) diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -3,24 +3,34 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.loader.git.loader import GitLoader -from swh.loader.git.tests.test_from_disk import DirGitLoaderTest +import os -from . import TEST_LOADER_CONFIG +import pytest +import dulwich.repo +from unittest import TestCase -class GitLoaderTest(GitLoader): - def parse_config_file(self, *args, **kwargs): - return {**super().parse_config_file(*args, **kwargs), **TEST_LOADER_CONFIG} +from swh.loader.git.loader import GitLoader +from swh.loader.git.tests.test_from_disk import FullGitLoaderTests +from swh.loader.git.tests import prepare_repository_from_archive -class TestGitLoader(DirGitLoaderTest): - """Same tests as for the GitLoaderFromDisk, but running on GitLoader.""" - def setUp(self): - super().setUp() - self.loader = GitLoaderTest(self.repo_url) - self.storage = self.loader.storage +class GitLoaderTest(TestCase, FullGitLoaderTests): + """Prepare a git directory repository to be loaded through a GitLoader. + This tests all git loader scenario. - def load(self): - return self.loader.load() + """ + + @pytest.fixture(autouse=True) + def init(self, swh_config, datadir, tmp_path): + super().setUp() + archive_name = "testrepo" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + tmp_path = str(tmp_path) + self.repo_url = prepare_repository_from_archive( + archive_path, archive_name, tmp_path=tmp_path + ) + self.destination_path = os.path.join(tmp_path, archive_name) + self.loader = GitLoader(self.repo_url) + self.repo = dulwich.repo.Repo(self.destination_path) diff --git a/tox.ini b/tox.ini --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ # https://github.com/pypa/pip/issues/6239 # TODO: remove when this issue is fixed swh.core[http] >= 0.0.61 + swh.storage[testing] pytest-cov commands = pytest --cov={envsitepackagesdir}/swh/loader/git \