diff --git a/MANIFEST.in b/MANIFEST.in index a2941af..6bf3aa0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,10 @@ +include conftest.py include Makefile include Makefile.local include README.db_testing include README.dev include requirements*.txt include version.txt recursive-include sql *.sql recursive-include swh/vault/sql *.sql recursive-include swh py.typed diff --git a/PKG-INFO b/PKG-INFO index db3fbca..92b3b0b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,31 +1,31 @@ Metadata-Version: 2.1 Name: swh.vault -Version: 0.1.0 +Version: 0.2.0 Summary: Software Heritage vault Home-page: https://forge.softwareheritage.org/diffusion/DVAU/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-vault Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-vault/ Description: swh-vault ========= User-facing service that allows to retrieve parts of the archive as self-contained bundles. See the [documentation](https://docs.softwareheritage.org/devel/swh-vault/index.html) for more details. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..92cc362 --- /dev/null +++ b/conftest.py @@ -0,0 +1,6 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +pytest_plugins = ["swh.storage.pytest_plugin", "swh.loader.pytest_plugin"] diff --git a/requirements-swh.txt b/requirements-swh.txt index f1b77b7..d9966ae 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core[db,http] >= 0.3 swh.model >= 0.3 swh.objstorage >= 0.0.17 -swh.scheduler >= 0.0.39 +swh.scheduler >= 0.7.0 swh.storage >= 0.0.106 diff --git a/requirements-test.txt b/requirements-test.txt index 7e08b30..078a4e3 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,7 @@ pytest pytest-aiohttp pytest-postgresql dulwich >= 0.18.7 +swh.loader.core swh.loader.git >= 0.0.52 swh.storage[testing] diff --git a/swh.vault.egg-info/PKG-INFO b/swh.vault.egg-info/PKG-INFO index db3fbca..92b3b0b 100644 --- a/swh.vault.egg-info/PKG-INFO +++ b/swh.vault.egg-info/PKG-INFO @@ -1,31 +1,31 @@ Metadata-Version: 2.1 Name: swh.vault -Version: 0.1.0 +Version: 0.2.0 Summary: Software Heritage vault Home-page: https://forge.softwareheritage.org/diffusion/DVAU/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-vault Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-vault/ Description: swh-vault ========= User-facing service that allows to retrieve parts of the archive as self-contained bundles. See the [documentation](https://docs.softwareheritage.org/devel/swh-vault/index.html) for more details. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.vault.egg-info/SOURCES.txt b/swh.vault.egg-info/SOURCES.txt index b4d719c..858be50 100644 --- a/swh.vault.egg-info/SOURCES.txt +++ b/swh.vault.egg-info/SOURCES.txt @@ -1,63 +1,64 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md +conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/api.rst docs/conf.py docs/getting-started.rst docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder sql/upgrades/002.sql sql/upgrades/003.sql swh/__init__.py swh.vault.egg-info/PKG-INFO swh.vault.egg-info/SOURCES.txt swh.vault.egg-info/dependency_links.txt swh.vault.egg-info/entry_points.txt swh.vault.egg-info/not-zip-safe swh.vault.egg-info/requires.txt swh.vault.egg-info/top_level.txt swh/vault/__init__.py swh/vault/backend.py swh/vault/cache.py swh/vault/cli.py swh/vault/cooking_tasks.py swh/vault/exc.py swh/vault/py.typed swh/vault/to_disk.py swh/vault/api/__init__.py swh/vault/api/client.py swh/vault/api/server.py swh/vault/cookers/__init__.py swh/vault/cookers/base.py swh/vault/cookers/directory.py swh/vault/cookers/revision_flat.py swh/vault/cookers/revision_gitfast.py swh/vault/cookers/utils.py swh/vault/sql/30-swh-schema.sql swh/vault/tests/__init__.py swh/vault/tests/conftest.py swh/vault/tests/test_backend.py swh/vault/tests/test_cache.py swh/vault/tests/test_cookers.py swh/vault/tests/test_cookers_base.py swh/vault/tests/test_server.py swh/vault/tests/test_to_disk.py swh/vault/tests/vault_testing.py \ No newline at end of file diff --git a/swh.vault.egg-info/requires.txt b/swh.vault.egg-info/requires.txt index 3a1d82d..7ce4bfe 100644 --- a/swh.vault.egg-info/requires.txt +++ b/swh.vault.egg-info/requires.txt @@ -1,18 +1,19 @@ click flask psycopg2 python-dateutil fastimport swh.core[db,http]>=0.3 swh.model>=0.3 swh.objstorage>=0.0.17 -swh.scheduler>=0.0.39 +swh.scheduler>=0.7.0 swh.storage>=0.0.106 [testing] pytest pytest-aiohttp pytest-postgresql dulwich>=0.18.7 +swh.loader.core swh.loader.git>=0.0.52 swh.storage[testing] diff --git a/swh/vault/api/server.py b/swh/vault/api/server.py index 8837fa2..6c178e0 100644 --- a/swh/vault/api/server.py +++ b/swh/vault/api/server.py @@ -1,236 +1,233 @@ # Copyright (C) 2016-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import asyncio import collections import os import aiohttp.web from swh.core import config from swh.core.api.asynchronous import RPCServerApp, decode_request from swh.core.api.asynchronous import encode_data_server as encode_data from swh.model import hashutil from swh.vault import get_vault from swh.vault.backend import NotFoundExc from swh.vault.cookers import COOKER_TYPES DEFAULT_CONFIG_PATH = "vault/server" DEFAULT_CONFIG = { "storage": ("dict", {"cls": "remote", "args": {"url": "http://localhost:5002/",},}), "cache": ( "dict", { "cls": "pathslicing", "args": {"root": "/srv/softwareheritage/vault", "slicing": "0:1/1:5",}, }, ), "client_max_size": ("int", 1024 ** 3), "vault": ( "dict", {"cls": "local", "args": {"db": "dbname=softwareheritage-vault-dev",},}, ), - "scheduler": ( - "dict", - {"cls": "remote", "args": {"url": "http://localhost:5008/",}}, - ), + "scheduler": ("dict", {"cls": "remote", "url": "http://localhost:5008/",},), } @asyncio.coroutine def index(request): return aiohttp.web.Response(body="SWH Vault API server") # Web API endpoints @asyncio.coroutine def vault_fetch(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] if not request.app["backend"].is_available(obj_type, obj_id): raise NotFoundExc(f"{obj_type} {obj_id} is not available.") return encode_data(request.app["backend"].fetch(obj_type, obj_id)) def user_info(task_info): return { "id": task_info["id"], "status": task_info["task_status"], "progress_message": task_info["progress_msg"], "obj_type": task_info["type"], "obj_id": hashutil.hash_to_hex(task_info["object_id"]), } @asyncio.coroutine def vault_cook(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] email = request.query.get("email") sticky = request.query.get("sticky") in ("true", "1") if obj_type not in COOKER_TYPES: raise NotFoundExc(f"{obj_type} is an unknown type.") info = request.app["backend"].cook_request( obj_type, obj_id, email=email, sticky=sticky ) # TODO: return 201 status (Created) once the api supports it return encode_data(user_info(info)) @asyncio.coroutine def vault_progress(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] info = request.app["backend"].task_info(obj_type, obj_id) if not info: raise NotFoundExc(f"{obj_type} {obj_id} was not found.") return encode_data(user_info(info)) # Cookers endpoints @asyncio.coroutine def set_progress(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] progress = yield from decode_request(request) request.app["backend"].set_progress(obj_type, obj_id, progress) return encode_data(True) # FIXME: success value? @asyncio.coroutine def set_status(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] status = yield from decode_request(request) request.app["backend"].set_status(obj_type, obj_id, status) return encode_data(True) # FIXME: success value? @asyncio.coroutine def put_bundle(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] # TODO: handle streaming properly content = yield from decode_request(request) request.app["backend"].cache.add(obj_type, obj_id, content) return encode_data(True) # FIXME: success value? @asyncio.coroutine def send_notif(request): obj_type = request.match_info["type"] obj_id = request.match_info["id"] request.app["backend"].send_all_notifications(obj_type, obj_id) return encode_data(True) # FIXME: success value? # Batch endpoints @asyncio.coroutine def batch_cook(request): batch = yield from decode_request(request) for obj_type, obj_id in batch: if obj_type not in COOKER_TYPES: raise NotFoundExc(f"{obj_type} is an unknown type.") batch_id = request.app["backend"].batch_cook(batch) return encode_data({"id": batch_id}) @asyncio.coroutine def batch_progress(request): batch_id = request.match_info["batch_id"] bundles = request.app["backend"].batch_info(batch_id) if not bundles: raise NotFoundExc(f"Batch {batch_id} does not exist.") bundles = [user_info(bundle) for bundle in bundles] counter = collections.Counter(b["status"] for b in bundles) res = { "bundles": bundles, "total": len(bundles), **{k: 0 for k in ("new", "pending", "done", "failed")}, **dict(counter), } return encode_data(res) # Web server def make_app(backend, **kwargs): app = RPCServerApp(**kwargs) app.router.add_route("GET", "/", index) app.client_exception_classes = (NotFoundExc,) # Endpoints used by the web API app.router.add_route("GET", "/fetch/{type}/{id}", vault_fetch) app.router.add_route("POST", "/cook/{type}/{id}", vault_cook) app.router.add_route("GET", "/progress/{type}/{id}", vault_progress) # Endpoints used by the Cookers app.router.add_route("POST", "/set_progress/{type}/{id}", set_progress) app.router.add_route("POST", "/set_status/{type}/{id}", set_status) app.router.add_route("POST", "/put_bundle/{type}/{id}", put_bundle) app.router.add_route("POST", "/send_notif/{type}/{id}", send_notif) # Endpoints for batch requests app.router.add_route("POST", "/batch_cook", batch_cook) app.router.add_route("GET", "/batch_progress/{batch_id}", batch_progress) app["backend"] = backend return app def get_local_backend(cfg): if "vault" not in cfg: raise ValueError("missing '%vault' configuration") vcfg = cfg["vault"] if vcfg["cls"] != "local": raise EnvironmentError( "The vault backend can only be started with a 'local' " "configuration", err=True, ) args = vcfg["args"] if "cache" not in args: args["cache"] = cfg.get("cache") if "storage" not in args: args["storage"] = cfg.get("storage") if "scheduler" not in args: args["scheduler"] = cfg.get("scheduler") for key in ("cache", "storage", "scheduler"): if not args.get(key): raise ValueError("invalid configuration; missing %s config entry." % key) return get_vault("local", args) def make_app_from_configfile(config_file=None, **kwargs): if config_file is None: config_file = DEFAULT_CONFIG_PATH config_file = os.environ.get("SWH_CONFIG_FILENAME", config_file) if os.path.isfile(config_file): cfg = config.read(config_file, DEFAULT_CONFIG) else: cfg = config.load_named_config(config_file, DEFAULT_CONFIG) vault = get_local_backend(cfg) return make_app(backend=vault, client_max_size=cfg["client_max_size"], **kwargs) if __name__ == "__main__": print("Deprecated. Use swh-vault ") diff --git a/swh/vault/cli.py b/swh/vault/cli.py index 7378f49..f3e1279 100644 --- a/swh/vault/cli.py +++ b/swh/vault/cli.py @@ -1,85 +1,86 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # WARNING: do not import unnecessary things here to keep cli startup time under # control import logging import click -from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup, swh as swh_cli_group +from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup +from swh.core.cli import swh as swh_cli_group CFG_HELP = """Software Heritage Vault RPC server.""" @swh_cli_group.group(name="vault", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup) @click.pass_context def vault(ctx): """Software Heritage Vault tools.""" pass @vault.command(name="rpc-serve", help=CFG_HELP) @click.option( "--config-file", "-C", default=None, metavar="CONFIGFILE", type=click.Path(exists=True, dir_okay=False,), help="Configuration file.", ) @click.option( "--no-stdout", is_flag=True, default=False, help="Do NOT output logs on the console" ) @click.option( "--host", default="0.0.0.0", metavar="IP", show_default=True, help="Host ip address to bind the server on", ) @click.option( "--port", default=5005, type=click.INT, metavar="PORT", help="Binding port of the server", ) @click.option( "--debug/--no-debug", default=True, help="Indicates if the server should run in debug mode", ) @click.pass_context def serve(ctx, config_file, no_stdout, host, port, debug): import aiohttp from swh.scheduler.celery_backend.config import setup_log_handler from swh.vault.api.server import make_app_from_configfile ctx.ensure_object(dict) setup_log_handler( loglevel=ctx.obj.get("log_level", logging.INFO), colorize=False, format="[%(levelname)s] %(name)s -- %(message)s", log_console=not no_stdout, ) try: app = make_app_from_configfile(config_file, debug=debug) except EnvironmentError as e: click.echo(e.msg, err=True) ctx.exit(1) aiohttp.web.run_app(app, host=host, port=int(port)) def main(): logging.basicConfig() return serve(auto_envvar_prefix="SWH_VAULT") if __name__ == "__main__": main() diff --git a/swh/vault/tests/conftest.py b/swh/vault/tests/conftest.py index bbdb652..9090e46 100644 --- a/swh/vault/tests/conftest.py +++ b/swh/vault/tests/conftest.py @@ -1,88 +1,88 @@ import glob import os import subprocess import pkg_resources.extern.packaging.version import pytest from pytest_postgresql import factories from swh.core.utils import numfile_sortkey as sortkey from swh.storage.tests import SQL_DIR as STORAGE_SQL_DIR from swh.vault import get_vault from swh.vault.tests import SQL_DIR os.environ["LC_ALL"] = "C.UTF-8" pytest_v = pkg_resources.get_distribution("pytest").parsed_version if pytest_v < pkg_resources.extern.packaging.version.parse("3.9"): @pytest.fixture def tmp_path(request): import pathlib import tempfile with tempfile.TemporaryDirectory() as tmpdir: yield pathlib.Path(tmpdir) def db_url(name, postgresql_proc): return "postgresql://{user}@{host}:{port}/{dbname}".format( host=postgresql_proc.host, port=postgresql_proc.port, user="postgres", dbname=name, ) postgresql2 = factories.postgresql("postgresql_proc", "tests2") @pytest.fixture def swh_vault(request, postgresql_proc, postgresql, postgresql2, tmp_path): for sql_dir, pg in ((SQL_DIR, postgresql), (STORAGE_SQL_DIR, postgresql2)): dump_files = os.path.join(sql_dir, "*.sql") all_dump_files = sorted(glob.glob(dump_files), key=sortkey) for fname in all_dump_files: subprocess.check_call( [ "psql", "--quiet", "--no-psqlrc", "-v", "ON_ERROR_STOP=1", "-d", pg.dsn, "-f", fname, ] ) vault_config = { "db": db_url("tests", postgresql_proc), "storage": { "cls": "local", "db": db_url("tests2", postgresql_proc), "objstorage": { "cls": "pathslicing", "args": {"root": str(tmp_path), "slicing": "0:1/1:5",}, }, }, "cache": { "cls": "pathslicing", "args": { "root": str(tmp_path), "slicing": "0:1/1:5", "allow_delete": True, }, }, - "scheduler": {"cls": "remote", "args": {"url": "http://swh-scheduler:5008",},}, + "scheduler": {"cls": "remote", "url": "http://swh-scheduler:5008",}, } return get_vault("local", vault_config) @pytest.fixture def swh_storage(swh_vault): return swh_vault.storage diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py index b05e381..714bfda 100644 --- a/swh/vault/tests/test_cookers.py +++ b/swh/vault/tests/test_cookers.py @@ -1,559 +1,575 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import contextlib import datetime import gzip import io import os import pathlib import subprocess import tarfile import tempfile import unittest import unittest.mock import dulwich.fastexport import dulwich.index import dulwich.objects import dulwich.porcelain import dulwich.repo +import pytest from swh.loader.git.from_disk import GitLoaderFromDisk from swh.model import from_disk, hashutil from swh.model.model import Directory, DirectoryEntry, Person, Revision, RevisionType from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker from swh.vault.tests.vault_testing import hash_content from swh.vault.to_disk import HIDDEN_MESSAGE, SKIPPED_MESSAGE class TestRepo: """A tiny context manager for a test git repository, with some utility functions to perform basic git stuff. """ def __enter__(self): self.tmp_dir = tempfile.TemporaryDirectory(prefix="tmp-vault-repo-") self.repo_dir = self.tmp_dir.__enter__() self.repo = dulwich.repo.Repo.init(self.repo_dir) self.author_name = b"Test Author" self.author_email = b"test@softwareheritage.org" self.author = b"%s <%s>" % (self.author_name, self.author_email) self.base_date = 258244200 self.counter = 0 return pathlib.Path(self.repo_dir) def __exit__(self, exc, value, tb): self.tmp_dir.__exit__(exc, value, tb) def checkout(self, rev_sha): rev = self.repo[rev_sha] dulwich.index.build_index_from_tree( self.repo_dir, self.repo.index_path(), self.repo.object_store, rev.tree ) def git_shell(self, *cmd, stdout=subprocess.DEVNULL, **kwargs): name = self.author_name email = self.author_email date = "%d +0000" % (self.base_date + self.counter) env = { # Set git commit format "GIT_AUTHOR_NAME": name, "GIT_AUTHOR_EMAIL": email, "GIT_AUTHOR_DATE": date, "GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "GIT_COMMITTER_DATE": date, # Ignore all the system-wide and user configurations "GIT_CONFIG_NOSYSTEM": "1", "HOME": str(self.tmp_dir), "XDG_CONFIG_HOME": str(self.tmp_dir), } kwargs.setdefault("env", {}).update(env) subprocess.check_call( ("git", "-C", self.repo_dir) + cmd, stdout=stdout, **kwargs ) def commit(self, message="Commit test\n", ref=b"HEAD"): """Commit the current working tree in a new commit with message on the branch 'ref'. At the end of the commit, the reference should stay the same and the index should be clean. """ self.git_shell("add", ".") message = message.encode() + b"\n" ret = self.repo.do_commit( message=message, committer=self.author, commit_timestamp=self.base_date + self.counter, commit_timezone=0, ref=ref, ) self.counter += 1 # committing on another branch leaves # dangling files in index if ref != b"HEAD": # XXX this should work (but does not) # dulwich.porcelain.reset(self.repo, 'hard') self.git_shell("reset", "--hard", "HEAD") return ret def merge(self, parent_sha_list, message="Merge branches."): self.git_shell( "merge", "--allow-unrelated-histories", "-m", message, *[p.decode() for p in parent_sha_list], ) self.counter += 1 return self.repo.refs[b"HEAD"] def print_debug_graph(self, reflog=False): args = ["log", "--all", "--graph", "--decorate"] if reflog: args.append("--reflog") self.git_shell(*args, stdout=None) +@pytest.fixture +def swh_loader_config(swh_loader_config): + swh_loader_config["max_content_size"] = 100 * 1024 * 1024 + return swh_loader_config + + +@pytest.fixture def git_loader( - storage, repo_path, visit_date=datetime.datetime.now(datetime.timezone.utc) + swh_storage, swh_loader_config, ): """Instantiate a Git Loader using the storage instance as storage. """ - loader = GitLoaderFromDisk( - "fake_origin", directory=repo_path, visit_date=visit_date - ) - loader.storage = storage - return loader + + def _create_loader(directory): + loader = GitLoaderFromDisk( + "fake_origin", + directory=directory, + visit_date=datetime.datetime.now(datetime.timezone.utc), + config=swh_loader_config, + ) + loader.storage = swh_storage + return loader + + return _create_loader @contextlib.contextmanager def cook_extract_directory(storage, obj_id): """Context manager that cooks a directory and extract it.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = DirectoryCooker("directory", obj_id, backend=backend, storage=storage) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) with tempfile.TemporaryDirectory(prefix="tmp-vault-extract-") as td: with tarfile.open(fileobj=cooker.fileobj, mode="r") as tar: tar.extractall(td) yield pathlib.Path(td) / hashutil.hash_to_hex(obj_id) cooker.storage = None @contextlib.contextmanager def cook_stream_revision_gitfast(storage, obj_id): """Context manager that cooks a revision and stream its fastexport.""" backend = unittest.mock.MagicMock() backend.storage = storage cooker = RevisionGitfastCooker( "revision_gitfast", obj_id, backend=backend, storage=storage ) cooker.fileobj = io.BytesIO() assert cooker.check_exists() cooker.prepare_bundle() cooker.fileobj.seek(0) fastexport_stream = gzip.GzipFile(fileobj=cooker.fileobj) yield fastexport_stream cooker.storage = None @contextlib.contextmanager def cook_extract_revision_gitfast(storage, obj_id): """Context manager that cooks a revision and extract it.""" test_repo = TestRepo() with cook_stream_revision_gitfast(storage, obj_id) as stream, test_repo as p: processor = dulwich.fastexport.GitImportProcessor(test_repo.repo) processor.import_stream(stream) yield test_repo, p TEST_CONTENT = ( " test content\n" "and unicode \N{BLACK HEART SUIT}\n" " and trailing spaces " ) TEST_EXECUTABLE = b"\x42\x40\x00\x00\x05" class TestDirectoryCooker: - def test_directory_simple(self, swh_storage): + def test_directory_simple(self, git_loader): repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o755) (rp / "link").symlink_to("file") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) c = repo.commit() - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_directory(swh_storage, obj_id) as p: + with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "file").read_text() == TEST_CONTENT assert (p / "executable").stat().st_mode == 0o100755 assert (p / "executable").read_bytes() == TEST_EXECUTABLE assert (p / "link").is_symlink assert os.readlink(str(p / "link")) == "file" assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT directory = from_disk.Directory.from_disk(path=bytes(p)) assert obj_id_hex == hashutil.hash_to_hex(directory.hash) - def test_directory_filtered_objects(self, swh_storage): + def test_directory_filtered_objects(self, git_loader): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) c = repo.commit() - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) # FIXME: storage.content_update() should be changed to allow things # like that - with swh_storage.get_db().transaction() as cur: + with loader.storage.get_db().transaction() as cur: cur.execute( """update content set status = 'visible' where sha1 = %s""", (id_1,), ) cur.execute( """update content set status = 'hidden' where sha1 = %s""", (id_2,), ) cur.execute( """update content set status = 'absent' where sha1 = %s""", (id_3,), ) - with cook_extract_directory(swh_storage, obj_id) as p: + with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE - def test_directory_bogus_perms(self, swh_storage): + def test_directory_bogus_perms(self, git_loader): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the directory # cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "file").chmod(0o664) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o775) (rp / "wat").write_text(TEST_CONTENT) (rp / "wat").chmod(0o604) c = repo.commit() - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo[c].tree.decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_directory(swh_storage, obj_id) as p: + with cook_extract_directory(loader.storage, obj_id) as p: assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 def test_directory_revision_data(self, swh_storage): target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=hashutil.hash_to_bytes(target_rev), perms=0o100644, ), ), ) swh_storage.directory_add([dir]) with cook_extract_directory(swh_storage, dir.id) as p: assert (p / "submodule").is_symlink() assert os.readlink(str(p / "submodule")) == target_rev class TestRevisionGitfastCooker: - def test_revision_simple(self, swh_storage): + def test_revision_simple(self, git_loader): # # 1--2--3--4--5--6--7 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) repo.commit("add file1") (rp / "file2").write_text(TEST_CONTENT) repo.commit("add file2") (rp / "dir1/dir2").mkdir(parents=True) (rp / "dir1/dir2/file").write_text(TEST_CONTENT) repo.commit("add dir1/dir2/file") (rp / "bin1").write_bytes(TEST_EXECUTABLE) (rp / "bin1").chmod(0o755) repo.commit("add bin1") (rp / "link1").symlink_to("file1") repo.commit("link link1 to file1") (rp / "file2").unlink() repo.commit("remove file2") (rp / "bin1").rename(rp / "bin") repo.commit("rename bin1 to bin") - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_revision_gitfast(swh_storage, obj_id) as (ert, p): + with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file1").stat().st_mode == 0o100644 assert (p / "file1").read_text() == TEST_CONTENT assert (p / "link1").is_symlink assert os.readlink(str(p / "link1")) == "file1" assert (p / "bin").stat().st_mode == 0o100755 assert (p / "bin").read_bytes() == TEST_EXECUTABLE assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT assert (p / "dir1/dir2/file").stat().st_mode == 0o100644 assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_two_roots(self, swh_storage): + def test_revision_two_roots(self, git_loader): # # 1----3---4 # / # 2---- # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") del repo.repo.refs[b"refs/heads/master"] # git update-ref -d HEAD (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") repo.merge([c1]) (rp / "file3").write_text(TEST_CONTENT) repo.commit("add file3") obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(swh_storage, obj_id) as (ert, p): + with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_two_double_fork_merge(self, swh_storage): + def test_revision_two_double_fork_merge(self, git_loader): # # 2---4---6 # / / / # 1---3---5 # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Add file1") repo.repo.refs[b"refs/heads/c1"] = c1 (rp / "file2").write_text(TEST_CONTENT) repo.commit("Add file2") (rp / "file3").write_text(TEST_CONTENT) c3 = repo.commit("Add file3", ref=b"refs/heads/c1") repo.repo.refs[b"refs/heads/c3"] = c3 repo.merge([c3]) (rp / "file5").write_text(TEST_CONTENT) c5 = repo.commit("Add file3", ref=b"refs/heads/c3") repo.merge([c5]) obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(swh_storage, obj_id) as (ert, p): + with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_triple_merge(self, swh_storage): + def test_revision_triple_merge(self, git_loader): # # .---.---5 # / / / # 2 3 4 # / / / # 1---.---. # repo = TestRepo() with repo as rp: (rp / "file1").write_text(TEST_CONTENT) c1 = repo.commit("Commit 1") repo.repo.refs[b"refs/heads/b1"] = c1 repo.repo.refs[b"refs/heads/b2"] = c1 repo.commit("Commit 2") c3 = repo.commit("Commit 3", ref=b"refs/heads/b1") c4 = repo.commit("Commit 4", ref=b"refs/heads/b2") repo.merge([c3, c4]) obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() - with cook_extract_revision_gitfast(swh_storage, obj_id) as (ert, p): + with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): assert ert.repo.refs[b"HEAD"].decode() == obj_id_hex - def test_revision_filtered_objects(self, swh_storage): + def test_revision_filtered_objects(self, git_loader): repo = TestRepo() with repo as rp: file_1, id_1 = hash_content(b"test1") file_2, id_2 = hash_content(b"test2") file_3, id_3 = hash_content(b"test3") (rp / "file").write_bytes(file_1) (rp / "hidden_file").write_bytes(file_2) (rp / "absent_file").write_bytes(file_3) repo.commit() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() # FIXME: storage.content_update() should be changed to allow things # like that - with swh_storage.get_db().transaction() as cur: + with loader.storage.get_db().transaction() as cur: cur.execute( """update content set status = 'visible' where sha1 = %s""", (id_1,), ) cur.execute( """update content set status = 'hidden' where sha1 = %s""", (id_2,), ) cur.execute( """update content set status = 'absent' where sha1 = %s""", (id_3,), ) - with cook_extract_revision_gitfast(swh_storage, obj_id) as (ert, p): + with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").read_bytes() == b"test1" assert (p / "hidden_file").read_bytes() == HIDDEN_MESSAGE assert (p / "absent_file").read_bytes() == SKIPPED_MESSAGE - def test_revision_bogus_perms(self, swh_storage): + def test_revision_bogus_perms(self, git_loader): # Some early git repositories have 664/775 permissions... let's check # if all the weird modes are properly normalized in the revision # cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) (rp / "file").chmod(0o664) (rp / "executable").write_bytes(TEST_EXECUTABLE) (rp / "executable").chmod(0o775) (rp / "wat").write_text(TEST_CONTENT) (rp / "wat").chmod(0o604) repo.commit("initial commit") - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() obj_id_hex = repo.repo.refs[b"HEAD"].decode() obj_id = hashutil.hash_to_bytes(obj_id_hex) - with cook_extract_revision_gitfast(swh_storage, obj_id) as (ert, p): + with cook_extract_revision_gitfast(loader.storage, obj_id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 assert (p / "executable").stat().st_mode == 0o100755 assert (p / "wat").stat().st_mode == 0o100644 - def test_revision_null_fields(self, swh_storage): + def test_revision_null_fields(self, git_loader): # Our schema doesn't enforce a lot of non-null revision fields. We need # to check these cases don't break the cooker. repo = TestRepo() with repo as rp: (rp / "file").write_text(TEST_CONTENT) c = repo.commit("initial commit") - loader = git_loader(swh_storage, str(rp)) + loader = git_loader(str(rp)) loader.load() repo.repo.refs[b"HEAD"].decode() dir_id_hex = repo.repo[c].tree.decode() dir_id = hashutil.hash_to_bytes(dir_id_hex) test_revision = Revision( message=b"", author=Person(name=None, email=None, fullname=b""), date=None, committer=Person(name=None, email=None, fullname=b""), committer_date=None, parents=(), type=RevisionType.GIT, directory=dir_id, metadata={}, synthetic=True, ) - swh_storage.revision_add([test_revision]) + storage = loader.storage + storage.revision_add([test_revision]) - with cook_extract_revision_gitfast(swh_storage, test_revision.id) as (ert, p): + with cook_extract_revision_gitfast(storage, test_revision.id) as (ert, p): ert.checkout(b"HEAD") assert (p / "file").stat().st_mode == 0o100644 def test_revision_revision_data(self, swh_storage): target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd" dir = Directory( entries=( DirectoryEntry( name=b"submodule", type="rev", target=hashutil.hash_to_bytes(target_rev), perms=0o100644, ), ), ) swh_storage.directory_add([dir]) rev = Revision( message=b"", author=Person(name=None, email=None, fullname=b""), date=None, committer=Person(name=None, email=None, fullname=b""), committer_date=None, parents=(), type=RevisionType.GIT, directory=dir.id, metadata={}, synthetic=True, ) swh_storage.revision_add([rev]) with cook_stream_revision_gitfast(swh_storage, rev.id) as stream: pattern = "M 160000 {} submodule".format(target_rev).encode() assert pattern in stream.read() diff --git a/tox.ini b/tox.ini index 04524da..f053ae8 100644 --- a/tox.ini +++ b/tox.ini @@ -1,34 +1,34 @@ [tox] envlist=black,flake8,mypy,py3 [testenv] extras = testing deps = pytest-cov commands = pytest --cov={envsitepackagesdir}/swh/vault \ {envsitepackagesdir}/swh/vault \ --cov-branch {posargs} [testenv:black] skip_install = true deps = - black + black==19.10b0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 [testenv:mypy] extras = testing deps = mypy commands = mypy swh