diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index df70797..dcdddf8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,54 +1,43 @@ repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 - hooks: - - id: trailing-whitespace - - id: check-json - - id: check-yaml + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: check-json + - id: check-yaml -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 - hooks: - - id: flake8 + - repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 -- repo: https://github.com/codespell-project/codespell - rev: v1.16.0 - hooks: - - id: codespell + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + name: Check source code spelling + args: [-L crate] + stages: [commit] + - id: codespell + name: Check commit message spelling + stages: [commit-msg] -- repo: local - hooks: - - id: mypy - name: mypy - entry: mypy - args: [swh] - pass_filenames: false - language: system - types: [python] + - repo: local + hooks: + - id: mypy + name: mypy + entry: mypy + args: [swh] + pass_filenames: false + language: system + types: [python] -- repo: https://github.com/PyCQA/isort - rev: 5.5.2 - hooks: - - id: isort + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort -- repo: https://github.com/python/black - rev: 19.10b0 - hooks: - - id: black - -# unfortunately, we are far from being able to enable this... -# - repo: https://github.com/PyCQA/pydocstyle.git -# rev: 4.0.0 -# hooks: -# - id: pydocstyle -# name: pydocstyle -# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. 
-# entry: pydocstyle --convention=google -# language: python -# types: [python] - -#- repo: https://github.com/asottile/blacken-docs -# rev: v1.0.0-1 -# hooks: -# - id: blacken-docs -# additional_dependencies: [black==19.3b0] + - repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black diff --git a/MANIFEST.in b/MANIFEST.in index 7db81e8..483b7ad 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,8 +1,9 @@ include Makefile include conftest.py include requirements*.txt include version.txt -recursive-include swh/core/sql *.sql +recursive-include swh/core/db/sql *.sql recursive-include swh py.typed +recursive-include swh/core/db/tests/data/ * recursive-include swh/core/tests/data/ * recursive-include swh/core/tests/fixture/data/ * diff --git a/PKG-INFO b/PKG-INFO index 69719e6..45e95ea 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,42 +1,42 @@ Metadata-Version: 2.1 Name: swh.core -Version: 1.1.1 +Version: 2.0.0 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-core/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing-core Provides-Extra: logging Provides-Extra: db Provides-Extra: http Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Core foundations ==================================== 
Low-level utilities and helpers used by almost all other modules in the stack. core library for swh's modules: - config parser - serialization - logging mechanism - database connection - http-based RPC client/server diff --git a/conftest.py b/conftest.py index 8b6908f..5f2c429 100644 --- a/conftest.py +++ b/conftest.py @@ -1,20 +1,20 @@ from hypothesis import settings import pytest -from swh.core.cli import swh as _swhmain - # define tests profile. Full documentation is at: # https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles settings.register_profile("fast", max_examples=5, deadline=5000) settings.register_profile("slow", max_examples=20, deadline=5000) @pytest.fixture def swhmain(): """Yield an instance of the main `swh` click command that cleans the added subcommands up on teardown.""" + from swh.core.cli import swh as _swhmain + commands = _swhmain.commands.copy() aliases = _swhmain.aliases.copy() yield _swhmain _swhmain.commands = commands _swhmain.aliases = aliases diff --git a/docs/db.rst b/docs/db.rst new file mode 100644 index 0000000..b17a9cb --- /dev/null +++ b/docs/db.rst @@ -0,0 +1,176 @@ +.. _swh-core-db: + +Common database utilities +========================= + +The ``swh.core.db`` module offers a set of common (postgresql) database +handling utilities and features for other swh packages implementing a +`datastore`, aka a service responsible for providing a data store via a common +interface which can use a postgresql database as backend. Examples are +:mod:`swh.storage` or :mod:`swh.scheduler`. + +Most of the time, this database-based data storage facility will depend on a data +schema (may be based in :mod:`swh.model` or not) and provide a unified interface +based on an Python class to abstract access to this datastore. + +Some packages may implement only a postgresql backend, some may provide more +backends. 
+ +This :mod:`swh.core.db` only deals with the postgresql part and provides common +features and tooling to manage the database lifecycle in a consistent and +unified way among all the :mod:`swh` packages. + +It comes with a few command line tools to manage the specific :mod:`swh` +package database. + +As such, most of the database management cli commands require a configuration +file holding the database connection information. + +For example, for the :mod:`swh.storage` package, one will be able to create, +initialize and upgrade the postgresql database using simple commands. + +To create the database and perform superuser initialization steps (see below): + +.. code-block:: bash + + $ swh db create storage --dbname=postgresql://superuser:passwd@localhost:5433/test-storage + +If the database already exists but lacks superuser level initialization steps, +you may use: + +.. code-block:: bash + + $ swh db init-admin storage --dbname=postgresql://superuser:passwd@localhost:5433/test-storage + + +Then, assuming the ``config.yml`` file exists: + +.. code-block:: yaml + + storage: + cls: postgresql + db: host=localhost, port=5433, dbname=test-storage, username=normal-user, password=pwd + objstorage: + cls: memory + +then you can run: + +.. code-block:: bash + + $ swh db --config-file=config.yml init storage + DONE database for storage initialized (flavor default) at version 182 + +Note: you can define the ``SWH_CONFIG_FILENAME`` environment variable instead +of using the ``--config-file`` command line option. + +or check the actual data model version of this database: + +.. code-block:: bash + + $ swh db --config-file=config.yml version storage + module: storage + flavor: default + version: 182 + +as well as the migration history for the database: + +.. 
code-block:: bash + + $ swh db --config-file=config.yml version --all storage + module: storage + flavor: default + 182 [2022-02-11 15:08:31.806070+01:00] Work In Progress + 181 [2022-02-11 14:06:27.435010+01:00] Work In Progress + + +The database migration is done using the ``swh db upgrade`` command. + + + +Implementation of a swh.core.db datastore +----------------------------------------- + +To use this database management tooling, in a :mod:`swh` package, the following +conditions are expected: + +- the package should provide an ``sql`` directory in its root namespace + providing initialization sql scripts. Scripts should be named like + ``nn-xxx.sql`` and are executed in order according to the ``nn`` integer + value. Scripts having ``-superuser-`` in their name will be executed by the + ``init-admin`` tool and are expected to require superuser access level, + whereas scripts without ``-superuser-`` in their name will be executed by the + ``swh db init`` command and are expected to require write access + level (with no need for superuser access level). + +- the package should provide a ``sql/upgrade`` directory with SQL migration + scripts in its root namespace. Script names are expected to be of the form + ``nnn.sql`` where `nnn` is the version to which this script does the + migration from a database at version `nnn - 1`. + +- the initialization and migration scripts should not create nor fill the + metadata related tables (``dbversion`` and ``dbmodule``). + +- the package should provide a ``get_datastore`` function in its root namespace + returning an instance of the datastore object. Normally, this datastore + object uses ``swh.core.db.BaseDb`` to interact with the actual database. + +- The datastore object should provide a ``get_current_version()`` method + returning the database version expected by the code. + +See existing ``swh`` packages like ``swh.storage`` or ``swh.scheduler`` for +usage examples. 
+ +Writing tests +------------- + +The ``swh.core.db.pytest_plugin`` provides a few helper tools to write unit +tests for postgresql based datastores. + +By default, when using these fixtures, a postgresql server will be started (by +the pytest_postgresql fixture) and a template database will be created using +the ``postgresql_proc`` fixture factory provided by ``pytest_postgresql``. + +Then a dedicated fixture must be declared to use the ``postgresql_proc`` +fixture generated by the fixture factory function. + +This template database will then be used to create a new database for test +using this dedicated fixture. + +In order to help the database initialization process and make it consistent +with the database initialization tools from the ``swh db`` cli, an +``initialize_database_for_module()`` function is provided to be used with the +fixture factory described above. + +Typically, writing tests for a ``swh`` package ``swh.example`` would look like: + +.. code-block:: python + + from functools import partial + + from pytest_postgresql import factories + from swh.core.db.pytest_plugin import postgresql_fact + from swh.core.db.pytest_plugin import initialize_database_for_module + + example_postgresql_proc = factories.postgresql_proc( + dbname="example", + load=[partial(initialize_database_for_module, + modname="example", version=1)] + ) + + postgresql_example = postgresql_fact("example_postgresql_proc") + + def test_example(postgresql_example): + with postgresql_example.cursor() as c: + c.execute("select version from dbversion limit 1") + assert c.fetchone()[0] == 1 + + +Note: most of the time, you will want to put the scaffolding part of the code +above in a ``conftest.py`` file. 
+ + +The ``load`` argument of the ``factories.postgresql_proc`` will be used to +initialize the template database that will be used to create a new database for +each test, while the ``load`` argument of the ``postgresql_fact`` fixture will +be executed before each test (in the database created from the template +database and dedicated to the test being executed). diff --git a/docs/index.rst b/docs/index.rst index 81d3fba..e48fb07 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,12 +1,13 @@ .. _swh-core: .. include:: README.rst Reference Documentation ----------------------- .. toctree:: :maxdepth: 2 cli + db /apidoc/swh.core diff --git a/requirements-db-pytestplugin.txt b/requirements-db-pytestplugin.txt index a15d41b..815fedd 100644 --- a/requirements-db-pytestplugin.txt +++ b/requirements-db-pytestplugin.txt @@ -1,2 +1,2 @@ # requirements for swh.core.db.pytest_plugin -pytest-postgresql < 4.0.0 # version 4.0 depends on psycopg 3. https://github.com/ClearcodeHQ/pytest-postgresql/blob/main/CHANGES.rst#400 +pytest-postgresql >=3, < 4.0.0 # version 4.0 depends on psycopg 3. 
https://github.com/ClearcodeHQ/pytest-postgresql/blob/main/CHANGES.rst#400 diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index 69719e6..45e95ea 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,42 +1,42 @@ Metadata-Version: 2.1 Name: swh.core -Version: 1.1.1 +Version: 2.0.0 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-core/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing-core Provides-Extra: logging Provides-Extra: db Provides-Extra: http Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Core foundations ==================================== Low-level utilities and helpers used by almost all other modules in the stack. 
core library for swh's modules: - config parser - serialization - logging mechanism - database connection - http-based RPC client/server diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index 103ab48..b68ffd3 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,118 +1,131 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-db-pytestplugin.txt requirements-db.txt requirements-http.txt requirements-logging.txt requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/cli.rst docs/conf.py +docs/db.rst docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/entry_points.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py swh/core/api_async.py swh/core/collections.py swh/core/config.py swh/core/logger.py swh/core/py.typed swh/core/pytest_plugin.py swh/core/sentry.py swh/core/statsd.py swh/core/tarball.py swh/core/utils.py swh/core/api/__init__.py swh/core/api/asynchronous.py swh/core/api/classes.py swh/core/api/gunicorn_config.py swh/core/api/negotiation.py swh/core/api/serializers.py swh/core/api/tests/__init__.py swh/core/api/tests/conftest.py swh/core/api/tests/server_testing.py swh/core/api/tests/test_async.py swh/core/api/tests/test_classes.py swh/core/api/tests/test_gunicorn.py swh/core/api/tests/test_init.py swh/core/api/tests/test_rpc_client.py swh/core/api/tests/test_rpc_client_server.py swh/core/api/tests/test_rpc_server.py swh/core/api/tests/test_rpc_server_asynchronous.py swh/core/api/tests/test_serializers.py swh/core/cli/__init__.py swh/core/cli/db.py swh/core/db/__init__.py swh/core/db/common.py 
swh/core/db/db_utils.py swh/core/db/pytest_plugin.py +swh/core/db/sql/35-dbversion.sql +swh/core/db/sql/36-dbmodule.sql swh/core/db/tests/__init__.py swh/core/db/tests/conftest.py swh/core/db/tests/test_cli.py swh/core/db/tests/test_db.py -swh/core/db/tests/data/cli/0-superuser-init.sql -swh/core/db/tests/data/cli/1-schema.sql -swh/core/db/tests/data/cli/3-func.sql -swh/core/db/tests/data/cli/4-data.sql +swh/core/db/tests/test_db_utils.py +swh/core/db/tests/data/cli/sql/0-superuser-init.sql +swh/core/db/tests/data/cli/sql/30-schema.sql +swh/core/db/tests/data/cli/sql/40-funcs.sql +swh/core/db/tests/data/cli/sql/50-data.sql +swh/core/db/tests/data/cli_new/sql/0-superuser-init.sql +swh/core/db/tests/data/cli_new/sql/30-schema.sql +swh/core/db/tests/data/cli_new/sql/40-funcs.sql +swh/core/db/tests/data/cli_new/sql/50-data.sql +swh/core/db/tests/data/cli_new/sql/upgrades/001.sql +swh/core/db/tests/data/cli_new/sql/upgrades/002.sql +swh/core/db/tests/data/cli_new/sql/upgrades/003.sql +swh/core/db/tests/data/cli_new/sql/upgrades/004.sql +swh/core/db/tests/data/cli_new/sql/upgrades/005.sql +swh/core/db/tests/data/cli_new/sql/upgrades/006.sql swh/core/db/tests/pytest_plugin/__init__.py swh/core/db/tests/pytest_plugin/test_pytest_plugin.py swh/core/db/tests/pytest_plugin/data/0-schema.sql swh/core/db/tests/pytest_plugin/data/1-data.sql -swh/core/sql/log-schema.sql swh/core/tests/__init__.py swh/core/tests/test_cli.py swh/core/tests/test_collections.py swh/core/tests/test_config.py swh/core/tests/test_logger.py swh/core/tests/test_pytest_plugin.py swh/core/tests/test_statsd.py swh/core/tests/test_tarball.py swh/core/tests/test_utils.py swh/core/tests/data/archives/groff-1.02.tar.Z swh/core/tests/data/archives/hello.tar swh/core/tests/data/archives/hello.tar.bz2 swh/core/tests/data/archives/hello.tar.gz swh/core/tests/data/archives/hello.tar.lz swh/core/tests/data/archives/hello.tar.x swh/core/tests/data/archives/hello.tbz swh/core/tests/data/archives/hello.tbz2 
swh/core/tests/data/archives/hello.zip swh/core/tests/data/archives/msk316src.zip swh/core/tests/data/archives/tokei-12.1.2.crate swh/core/tests/data/http_example.com/something.json swh/core/tests/data/https_example.com/file.json swh/core/tests/data/https_example.com/file.json,name=doe,firstname=jane swh/core/tests/data/https_example.com/file.json_visit1 swh/core/tests/data/https_example.com/other.json swh/core/tests/data/https_forge.s.o/api_diffusion,attachments[uris]=1 swh/core/tests/data/https_www.reference.com/web,q=What+Is+an+Example+of+a+URL?,qo=contentPageRelatedSearch,o=600605,l=dir,sga=1 swh/core/tests/fixture/__init__.py swh/core/tests/fixture/conftest.py swh/core/tests/fixture/test_pytest_plugin.py swh/core/tests/fixture/data/https_example.com/file.json \ No newline at end of file diff --git a/swh.core.egg-info/entry_points.txt b/swh.core.egg-info/entry_points.txt index aac40d0..b07cbd6 100644 --- a/swh.core.egg-info/entry_points.txt +++ b/swh.core.egg-info/entry_points.txt @@ -1,9 +1,9 @@ +[console_scripts] +swh = swh.core.cli:main +swh-db-init = swh.core.cli.db:db_init - [console_scripts] - swh=swh.core.cli:main - swh-db-init=swh.core.cli.db:db_init - [swh.cli.subcommands] - db=swh.core.cli.db - [pytest11] - pytest_swh_core = swh.core.pytest_plugin - \ No newline at end of file +[pytest11] +pytest_swh_core = swh.core.pytest_plugin + +[swh.cli.subcommands] +db = swh.core.cli.db diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt index f297994..34f80b6 100644 --- a/swh.core.egg-info/requires.txt +++ b/swh.core.egg-info/requires.txt @@ -1,57 +1,57 @@ click deprecated python-magic pyyaml sentry-sdk [db] psycopg2 typing-extensions -pytest-postgresql<4.0.0 +pytest-postgresql<4.0.0,>=3 [http] aiohttp aiohttp_utils>=3.1.1 blinker flask iso8601 msgpack>=1.0.0 requests [logging] systemd-python [testing] hypothesis>=3.11.0 pytest<7.0.0 pytest-mock pytz requests-mock types-click types-flask types-pytz types-pyyaml types-requests psycopg2 
typing-extensions -pytest-postgresql<4.0.0 +pytest-postgresql<4.0.0,>=3 aiohttp aiohttp_utils>=3.1.1 blinker flask iso8601 msgpack>=1.0.0 requests systemd-python [testing-core] hypothesis>=3.11.0 pytest<7.0.0 pytest-mock pytz requests-mock types-click types-flask types-pytz types-pyyaml types-requests diff --git a/swh/core/cli/db.py b/swh/core/cli/db.py index 3958772..1e094d8 100755 --- a/swh/core/cli/db.py +++ b/swh/core/cli/db.py @@ -1,335 +1,409 @@ #!/usr/bin/env python3 -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -from os import environ, path -from typing import Collection, Dict, Optional, Tuple +from os import environ import warnings import click from swh.core.cli import CONTEXT_SETTINGS from swh.core.cli import swh as swh_cli_group warnings.filterwarnings("ignore") # noqa prevent psycopg from telling us sh*t logger = logging.getLogger(__name__) @swh_cli_group.group(name="db", context_settings=CONTEXT_SETTINGS) @click.option( "--config-file", "-C", default=None, type=click.Path(exists=True, dir_okay=False), help="Configuration file.", ) @click.pass_context def db(ctx, config_file): """Software Heritage database generic tools.""" from swh.core.config import read as config_read ctx.ensure_object(dict) if config_file is None: config_file = environ.get("SWH_CONFIG_FILENAME") cfg = config_read(config_file) ctx.obj["config"] = cfg @db.command(name="create", context_settings=CONTEXT_SETTINGS) @click.argument("module", required=True) @click.option( "--dbname", "--db-name", "-d", help="Database name.", default="softwareheritage-dev", show_default=True, ) @click.option( "--template", "-T", help="Template database from which to build this database.", default="template1", show_default=True, ) 
def db_create(module, dbname, template): """Create a database for the Software Heritage . and potentially execute superuser-level initialization steps. Example:: swh db create -d swh-test storage If you want to specify non-default postgresql connection parameters, please provide them using standard environment variables or by the mean of a properly crafted libpq connection URI. See psql(1) man page (section ENVIRONMENTS) for details. Note: this command requires a postgresql connection with superuser permissions. Example:: PGPORT=5434 swh db create indexer swh db create -d postgresql://superuser:passwd@pghost:5433/swh-storage storage """ + from swh.core.db.db_utils import create_database_for_package logger.debug("db_create %s dn_name=%s", module, dbname) create_database_for_package(module, dbname, template) @db.command(name="init-admin", context_settings=CONTEXT_SETTINGS) @click.argument("module", required=True) @click.option( "--dbname", "--db-name", "-d", help="Database name.", default="softwareheritage-dev", show_default=True, ) def db_init_admin(module: str, dbname: str) -> None: """Execute superuser-level initialization steps (e.g pg extensions, admin functions, ...) Example:: PGPASSWORD=... swh db init-admin -d swh-test scheduler If you want to specify non-default postgresql connection parameters, please provide them using standard environment variables or by the mean of a properly crafted libpq connection URI. See psql(1) man page (section ENVIRONMENTS) for details. Note: this command requires a postgresql connection with superuser permissions (e.g postgres, swh-admin, ...) 
Example:: PGPORT=5434 swh db init-admin scheduler swh db init-admin -d postgresql://superuser:passwd@pghost:5433/swh-scheduler \ scheduler """ + from swh.core.db.db_utils import init_admin_extensions + logger.debug("db_init_admin %s dbname=%s", module, dbname) init_admin_extensions(module, dbname) @db.command(name="init", context_settings=CONTEXT_SETTINGS) @click.argument("module", required=True) @click.option( "--dbname", "--db-name", "-d", - help="Database name.", - default="softwareheritage-dev", - show_default=True, + help="Database name or connection URI.", + default=None, + show_default=False, ) @click.option( "--flavor", help="Database flavor.", default=None, ) -def db_init(module, dbname, flavor): +@click.option( + "--initial-version", help="Database initial version.", default=1, show_default=True +) +@click.pass_context +def db_init(ctx, module, dbname, flavor, initial_version): """Initialize a database for the Software Heritage . - Example:: + The database connection string comes from the configuration file (see + option ``--config-file`` in ``swh db --help``) in the section named after + the MODULE argument. - swh db init -d swh-test storage + Example:: - If you want to specify non-default postgresql connection parameters, - please provide them using standard environment variables. - See psql(1) man page (section ENVIRONMENTS) for details. + $ cat conf.yml + storage: + cls: postgresql + db: postgresql://user:passwd@pghost:5433/swh-storage + objstorage: + cls: memory - Examples:: + $ swh db -C conf.yml init storage # or + $ SWH_CONFIG_FILENAME=conf.yml swh db init storage - PGPORT=5434 swh db init indexer - swh db init -d postgresql://user:passwd@pghost:5433/swh-storage storage - swh db init --flavor read_replica -d swh-storage storage + Note that the connection string can also be passed directly using the + '--db-name' option, but this usage is about to be deprecated. 
""" + from swh.core.db.db_utils import ( + get_database_info, + import_swhmodule, + populate_database_for_package, + swh_set_db_version, + ) + + cfg = None + if dbname is None: + # use the db cnx from the config file; the expected config entry is the + # given module name + cfg = ctx.obj["config"].get(module, {}) + dbname = get_dburl_from_config(cfg) + + if not dbname: + raise click.BadParameter( + "Missing the postgresql connection configuration. Either fix your " + "configuration file or use the --dbname option." + ) logger.debug("db_init %s flavor=%s dbname=%s", module, flavor, dbname) initialized, dbversion, dbflavor = populate_database_for_package( module, dbname, flavor ) + if dbversion is None: + if cfg is not None: + # db version has not been populated by sql init scripts (new style), + # let's do it; instantiate the data source to retrieve the current + # (expected) db version + datastore_factory = getattr(import_swhmodule(module), "get_datastore", None) + if datastore_factory: + datastore = datastore_factory(**cfg) + try: + get_current_version = datastore.get_current_version + except AttributeError: + logger.warning( + "Datastore %s does not implement the " + "'get_current_version()' method", + datastore, + ) + else: + code_version = get_current_version() + logger.info( + "Initializing database version to %s from the %s datastore", + code_version, + module, + ) + swh_set_db_version(dbname, code_version, desc="DB initialization") + + dbversion = get_database_info(dbname)[1] + if dbversion is None: + logger.info( + "Initializing database version to %s " + "from the command line option --initial-version", + initial_version, + ) + swh_set_db_version(dbname, initial_version, desc="DB initialization") + + dbversion = get_database_info(dbname)[1] + assert dbversion is not None # TODO: Ideally migrate the version from db_version to the latest # db version click.secho( "DONE database for {} {}{} at version {}".format( module, "initialized" if initialized else 
"exists", f" (flavor {dbflavor})" if dbflavor is not None else "", dbversion, ), fg="green", bold=True, ) if flavor is not None and dbflavor != flavor: click.secho( f"WARNING requested flavor '{flavor}' != recorded flavor '{dbflavor}'", fg="red", bold=True, ) -def get_sql_for_package(modname): - import glob - from importlib import import_module +@db.command(name="version", context_settings=CONTEXT_SETTINGS) +@click.argument("module", required=True) +@click.option( + "--all/--no-all", + "show_all", + help="Show version history.", + default=False, + show_default=True, +) +@click.pass_context +def db_version(ctx, module, show_all): + """Print the database version for the Software Heritage. + + Example:: + + swh db version -d swh-test - from swh.core.utils import numfile_sortkey as sortkey + """ + from swh.core.db.db_utils import get_database_info, import_swhmodule - if not modname.startswith("swh."): - modname = "swh.{}".format(modname) - try: - m = import_module(modname) - except ImportError: - raise click.BadParameter("Unable to load module {}".format(modname)) + # use the db cnx from the config file; the expected config entry is the + # given module name + cfg = ctx.obj["config"].get(module, {}) + dbname = get_dburl_from_config(cfg) - sqldir = path.join(path.dirname(m.__file__), "sql") - if not path.isdir(sqldir): + if not dbname: raise click.BadParameter( - "Module {} does not provide a db schema " "(no sql/ dir)".format(modname) + "Missing the postgresql connection configuration. Either fix your " + "configuration file or use the --dbname option." ) - return sorted(glob.glob(path.join(sqldir, "*.sql")), key=sortkey) - -def populate_database_for_package( - modname: str, conninfo: str, flavor: Optional[str] = None -) -> Tuple[bool, int, Optional[str]]: - """Populate the database, pointed at with ``conninfo``, - using the SQL files found in the package ``modname``. 
+ logger.debug("db_version dbname=%s", dbname) - Args: - modname: Name of the module of which we're loading the files - conninfo: connection info string for the SQL database - flavor: the module-specific flavor which we want to initialize the database under + db_module, db_version, db_flavor = get_database_info(dbname) + if db_module is None: + click.secho( + "WARNING the database does not have a dbmodule table.", fg="red", bold=True + ) + db_module = module + assert db_module == module, f"{db_module} (in the db) != {module} (given)" - Returns: - Tuple with three elements: whether the database has been initialized; the current - version of the database; if it exists, the flavor of the database. - """ - from swh.core.db.db_utils import swh_db_flavor, swh_db_version + click.secho(f"module: {db_module}", fg="green", bold=True) - current_version = swh_db_version(conninfo) - if current_version is not None: - dbflavor = swh_db_flavor(conninfo) - return False, current_version, dbflavor + if db_flavor is not None: + click.secho(f"flavor: {db_flavor}", fg="green", bold=True) - sqlfiles = get_sql_for_package(modname) - sqlfiles = [fname for fname in sqlfiles if "-superuser-" not in fname] - execute_sqlfiles(sqlfiles, conninfo, flavor) + # instantiate the data source to retrieve the current (expected) db version + datastore_factory = getattr(import_swhmodule(db_module), "get_datastore", None) + if datastore_factory: + datastore = datastore_factory(**cfg) + code_version = datastore.get_current_version() + click.secho( + f"current code version: {code_version}", + fg="green" if code_version == db_version else "red", + bold=True, + ) - current_version = swh_db_version(conninfo) - assert current_version is not None - dbflavor = swh_db_flavor(conninfo) - return True, current_version, dbflavor + if not show_all: + click.secho(f"version: {db_version}", fg="green", bold=True) + else: + from swh.core.db.db_utils import swh_db_versions + versions = swh_db_versions(dbname) + for version, 
tstamp, desc in versions: + click.echo(f"{version} [{tstamp}] {desc}") -def parse_dsn_or_dbname(dsn_or_dbname: str) -> Dict[str, str]: - """Parse a psycopg2 dsn, falling back to supporting plain database names as well""" - import psycopg2 - from psycopg2.extensions import parse_dsn as _parse_dsn - try: - return _parse_dsn(dsn_or_dbname) - except psycopg2.ProgrammingError: - # psycopg2 failed to parse the DSN; it's probably a database name, - # handle it as such - return _parse_dsn(f"dbname={dsn_or_dbname}") +@db.command(name="upgrade", context_settings=CONTEXT_SETTINGS) +@click.argument("module", required=True) +@click.option( + "--to-version", + type=int, + help="Upgrade up to version VERSION", + metavar="VERSION", + default=None, +) +@click.pass_context +def db_upgrade(ctx, module, to_version): + """Upgrade the database for given module (to a given version if specified). + Examples:: -def init_admin_extensions(modname: str, conninfo: str) -> None: - """The remaining initialization process -- running -superuser- SQL files -- is done - using the given conninfo, thus connecting to the newly created database + swh db upgrade storage + swg db upgrade scheduler --to-version=10 """ - sqlfiles = get_sql_for_package(modname) - sqlfiles = [fname for fname in sqlfiles if "-superuser-" in fname] - execute_sqlfiles(sqlfiles, conninfo) + from swh.core.db.db_utils import ( + get_database_info, + import_swhmodule, + swh_db_upgrade, + swh_set_db_module, + ) + + # use the db cnx from the config file; the expected config entry is the + # given module name + cfg = ctx.obj["config"].get(module, {}) + dbname = get_dburl_from_config(cfg) + if not dbname: + raise click.BadParameter( + "Missing the postgresql connection configuration. Either fix your " + "configuration file or use the --dbname option." 
+ ) -def create_database_for_package( - modname: str, conninfo: str, template: str = "template1" -): - """Create the database pointed at with ``conninfo``, and initialize it using - -superuser- SQL files found in the package ``modname``. + logger.debug("db_version dbname=%s", dbname) - Args: - modname: Name of the module of which we're loading the files - conninfo: connection info string or plain database name for the SQL database - template: the name of the database to connect to and use as template to create - the new database + db_module, db_version, db_flavor = get_database_info(dbname) + if db_module is None: + click.secho( + "Warning: the database does not have a dbmodule table.", + fg="yellow", + bold=True, + ) + if not click.confirm( + f"Write the module information ({module}) in the database?", default=True + ): + raise click.BadParameter("Migration aborted.") + swh_set_db_module(dbname, module) + db_module = module + + if db_module != module: + raise click.BadParameter( + f"Error: the given module ({module}) does not match the value " + f"stored in the database ({db_module})." 
+ ) - """ - import subprocess - - from psycopg2.extensions import make_dsn - - # Use the given conninfo string, but with dbname replaced by the template dbname - # for the database creation step - creation_dsn = parse_dsn_or_dbname(conninfo) - dbname = creation_dsn["dbname"] - creation_dsn["dbname"] = template - logger.debug("db_create dbname=%s (from %s)", dbname, template) - subprocess.check_call( - [ - "psql", - "--quiet", - "--no-psqlrc", - "-v", - "ON_ERROR_STOP=1", - "-d", - make_dsn(**creation_dsn), - "-c", - f'CREATE DATABASE "{dbname}"', - ] - ) - init_admin_extensions(modname, conninfo) + # instantiate the data source to retrieve the current (expected) db version + datastore_factory = getattr(import_swhmodule(db_module), "get_datastore", None) + if not datastore_factory: + raise click.UsageError( + f"You cannot use this command on old-style datastore backend {db_module}" + ) + datastore = datastore_factory(**cfg) + ds_version = datastore.get_current_version() + if to_version is None: + to_version = ds_version + if to_version > ds_version: + raise click.UsageError( + f"The target version {to_version} is larger than the current version " + f"{ds_version} of the datastore backend {db_module}" + ) + new_db_version = swh_db_upgrade(dbname, module, to_version) + click.secho(f"Migration to version {new_db_version} done", fg="green") + if new_db_version < ds_version: + click.secho( + f"Warning: migration was not complete: the current version is {ds_version}", + fg="yellow", + ) -def execute_sqlfiles( - sqlfiles: Collection[str], conninfo: str, flavor: Optional[str] = None -): - """Execute a list of SQL files on the database pointed at with ``conninfo``.
- Args: - sqlfiles: List of SQL files to execute - conninfo: connection info string for the SQL database - flavor: the database flavor to initialize - """ - import subprocess - - psql_command = [ - "psql", - "--quiet", - "--no-psqlrc", - "-v", - "ON_ERROR_STOP=1", - "-d", - conninfo, - ] - - flavor_set = False - for sqlfile in sqlfiles: - logger.debug(f"execute SQL file {sqlfile} dbname={conninfo}") - subprocess.check_call(psql_command + ["-f", sqlfile]) - - if flavor is not None and not flavor_set and sqlfile.endswith("-flavor.sql"): - logger.debug("Setting database flavor %s", flavor) - query = f"insert into dbflavor (flavor) values ('{flavor}')" - subprocess.check_call(psql_command + ["-c", query]) - flavor_set = True - - if flavor is not None and not flavor_set: - logger.warn( - "Asked for flavor %s, but module does not support database flavors", flavor, +def get_dburl_from_config(cfg): + if cfg.get("cls") != "postgresql": + raise click.BadParameter( + "Configuration cls must be set to 'postgresql' for this command." 
) + if "args" in cfg: + # for bw compat + cfg = cfg["args"] + return cfg.get("db") diff --git a/swh/core/db/db_utils.py b/swh/core/db/db_utils.py index 1c9088e..92491ac 100644 --- a/swh/core/db/db_utils.py +++ b/swh/core/db/db_utils.py @@ -1,252 +1,664 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime, timezone import functools +from importlib import import_module import logging +from os import path +import pathlib import re -from typing import Optional, Union +import subprocess +from typing import Collection, Dict, List, Optional, Tuple, Union import psycopg2 import psycopg2.extensions +from psycopg2.extensions import connection as pgconnection +from psycopg2.extensions import encodings as pgencodings +from psycopg2.extensions import make_dsn +from psycopg2.extensions import parse_dsn as _parse_dsn + +from swh.core.utils import numfile_sortkey as sortkey logger = logging.getLogger(__name__) +def now(): + return datetime.now(tz=timezone.utc) + + def stored_procedure(stored_proc): """decorator to execute remote stored procedure, specified as argument Generally, the body of the decorated function should be empty. If it is not, the stored procedure will be executed first; the function body then. 
""" def wrap(meth): @functools.wraps(meth) def _meth(self, *args, **kwargs): cur = kwargs.get("cur", None) self._cursor(cur).execute("SELECT %s()" % stored_proc) meth(self, *args, **kwargs) return _meth return wrap def jsonize(value): """Convert a value to a psycopg2 JSON object if necessary""" if isinstance(value, dict): return psycopg2.extras.Json(value) return value -def connect_to_conninfo( - db_or_conninfo: Union[str, psycopg2.extensions.connection] -) -> psycopg2.extensions.connection: +def connect_to_conninfo(db_or_conninfo: Union[str, pgconnection]) -> pgconnection: """Connect to the database passed in argument Args: db_or_conninfo: A database connection, or a database connection info string Returns: a connected database handle Raises: psycopg2.Error if the database doesn't exist """ - if isinstance(db_or_conninfo, psycopg2.extensions.connection): + if isinstance(db_or_conninfo, pgconnection): return db_or_conninfo if "=" not in db_or_conninfo and "//" not in db_or_conninfo: # Database name db_or_conninfo = f"dbname={db_or_conninfo}" db = psycopg2.connect(db_or_conninfo) return db -def swh_db_version( - db_or_conninfo: Union[str, psycopg2.extensions.connection] -) -> Optional[int]: +def swh_db_version(db_or_conninfo: Union[str, pgconnection]) -> Optional[int]: """Retrieve the swh version of the database. If the database is not initialized, this logs a warning and returns None. 
Args: db_or_conninfo: A database connection, or a database connection info string Returns: Either the version of the database, or None if it couldn't be detected """ try: db = connect_to_conninfo(db_or_conninfo) except psycopg2.Error: logger.exception("Failed to connect to `%s`", db_or_conninfo) # Database not initialized return None try: with db.cursor() as c: query = "select version from dbversion order by dbversion desc limit 1" try: c.execute(query) - return c.fetchone()[0] + result = c.fetchone() + if result: + return result[0] except psycopg2.errors.UndefinedTable: return None except Exception: logger.exception("Could not get version from `%s`", db_or_conninfo) + return None + + +def swh_db_versions( + db_or_conninfo: Union[str, pgconnection] +) -> Optional[List[Tuple[int, datetime, str]]]: + """Retrieve the swh version history of the database. + + If the database is not initialized, this logs a warning and returns None. + + Args: + db_or_conninfo: A database connection, or a database connection info string + + Returns: + Either the version of the database, or None if it couldn't be detected + """ + try: + db = connect_to_conninfo(db_or_conninfo) + except psycopg2.Error: + logger.exception("Failed to connect to `%s`", db_or_conninfo) + # Database not initialized + return None + + try: + with db.cursor() as c: + query = ( + "select version, release, description " + "from dbversion order by dbversion desc" + ) + try: + c.execute(query) + return c.fetchall() + except psycopg2.errors.UndefinedTable: + return None + except Exception: + logger.exception("Could not get versions from `%s`", db_or_conninfo) + return None + + +def swh_db_upgrade( + conninfo: str, modname: str, to_version: Optional[int] = None +) -> int: + """Upgrade the database at `conninfo` for module `modname` + + This will run migration scripts found in the `sql/upgrades` subdirectory of + the module `modname`. By default, this will upgrade to the latest declared version. 
+ + Args: + conninfo: A database connection, or a database connection info string + modname: datastore module the database stores content for + to_version: if given, update the database to this version rather than the latest + + """ + + if to_version is None: + to_version = 99999999 + + db_module, db_version, db_flavor = get_database_info(conninfo) + if db_version is None: + raise ValueError("Unable to retrieve the current version of the database") + if db_module is None: + raise ValueError("Unable to retrieve the module of the database") + if db_module != modname: + raise ValueError( + "The stored module of the database is different than the given one" + ) + + sqlfiles = [ + fname + for fname in get_sql_for_package(modname, upgrade=True) + if db_version < int(fname.stem) <= to_version + ] + + for sqlfile in sqlfiles: + new_version = int(path.splitext(path.basename(sqlfile))[0]) + logger.info(f"Executing migration script {sqlfile}") + if db_version is not None and (new_version - db_version) > 1: + logger.error( + f"There are missing migration steps between {db_version} and " + f"{new_version}. It might be expected but it most likely is not. " + "Will stop here." + ) + return db_version + + execute_sqlfiles([sqlfile], conninfo, db_flavor) + + # check if the db version has been updated by the upgrade script + db_version = swh_db_version(conninfo) + assert db_version is not None + if db_version == new_version: + # nothing to do, upgrade script did the job + pass + elif db_version == new_version - 1: + # it has not (new style), so do it + swh_set_db_version( + conninfo, + new_version, + desc=f"Upgraded to version {new_version} using {sqlfile}", + ) + db_version = swh_db_version(conninfo) + else: + # upgrade script did it wrong + logger.error( + f"The upgrade script {sqlfile} did not update the dbversion table " + f"consistently ({db_version} vs. expected {new_version}). " + "Will stop migration here. Please check your migration scripts."
+ ) + return db_version + return new_version + + +def swh_db_module(db_or_conninfo: Union[str, pgconnection]) -> Optional[str]: + """Retrieve the swh module used to create the database. + + If the database is not initialized, this logs a warning and returns None. + + Args: + db_or_conninfo: A database connection, or a database connection info string + + Returns: + Either the module of the database, or None if it couldn't be detected + """ + try: + db = connect_to_conninfo(db_or_conninfo) + except psycopg2.Error: + logger.exception("Failed to connect to `%s`", db_or_conninfo) + # Database not initialized + return None + + try: + with db.cursor() as c: + query = "select dbmodule from dbmodule limit 1" + try: + c.execute(query) + resp = c.fetchone() + if resp: + return resp[0] + except psycopg2.errors.UndefinedTable: + return None + except Exception: + logger.exception("Could not get module from `%s`", db_or_conninfo) + return None + + +def swh_set_db_module( + db_or_conninfo: Union[str, pgconnection], module: str, force=False +) -> None: + """Set the swh module used to create the database. + + Fails if the dbmodule is already set or the table does not exist. 
+ + Args: + db_or_conninfo: A database connection, or a database connection info string + module: the swh module to register (without the leading 'swh.') + """ + update = False + if module.startswith("swh."): + module = module[4:] + + current_module = swh_db_module(db_or_conninfo) + if current_module is not None: + if current_module == module: + logger.warning("The database module is already set to %s", module) + return + + if not force: + raise ValueError( + "The database module is already set to a value %s " + "different than given %s", + current_module, + module, + ) + # force is True + update = True + try: + db = connect_to_conninfo(db_or_conninfo) + except psycopg2.Error: + logger.exception("Failed to connect to `%s`", db_or_conninfo) + # Database not initialized return None + sqlfiles = [ + fname + for fname in get_sql_for_package("swh.core.db") + if "dbmodule" in fname.stem + ] + execute_sqlfiles(sqlfiles, db_or_conninfo) + + with db.cursor() as c: + if update: + query = "update dbmodule set dbmodule = %s" + else: + query = "insert into dbmodule(dbmodule) values (%s)" + c.execute(query, (module,)) + db.commit() + + +def swh_set_db_version( + db_or_conninfo: Union[str, pgconnection], + version: int, + ts: Optional[datetime] = None, + desc: str = "Work in progress", +) -> None: + """Set the version of the database. -def swh_db_flavor( - db_or_conninfo: Union[str, psycopg2.extensions.connection] -) -> Optional[str]: + Fails if the dbversion table does not exists. 
+ + Args: + db_or_conninfo: A database connection, or a database connection info string + version: the version to add + """ + try: + db = connect_to_conninfo(db_or_conninfo) + except psycopg2.Error: + logger.exception("Failed to connect to `%s`", db_or_conninfo) + # Database not initialized + return None + if ts is None: + ts = now() + with db.cursor() as c: + query = ( + "insert into dbversion(version, release, description) values (%s, %s, %s)" + ) + c.execute(query, (version, ts, desc)) + db.commit() + + +def swh_db_flavor(db_or_conninfo: Union[str, pgconnection]) -> Optional[str]: """Retrieve the swh flavor of the database. If the database is not initialized, or the database doesn't support flavors, this returns None. Args: db_or_conninfo: A database connection, or a database connection info string Returns: The flavor of the database, or None if it could not be detected. """ try: db = connect_to_conninfo(db_or_conninfo) except psycopg2.Error: logger.exception("Failed to connect to `%s`", db_or_conninfo) # Database not initialized return None try: with db.cursor() as c: query = "select swh_get_dbflavor()" try: c.execute(query) return c.fetchone()[0] except psycopg2.errors.UndefinedFunction: # function not found: no flavor return None except Exception: logger.exception("Could not get flavor from `%s`", db_or_conninfo) return None # The following code has been imported from psycopg2, version 2.7.4, # https://github.com/psycopg/psycopg2/tree/5afb2ce803debea9533e293eef73c92ffce95bcd # and modified by Software Heritage. # # Original file: lib/extras.py # # psycopg2 is free software: you can redistribute it and/or modify it under the # terms of the GNU Lesser General Public License as published by the Free # Software Foundation, either version 3 of the License, or (at your option) any # later version. def _paginate(seq, page_size): """Consume an iterable and return it in chunks. Every chunk is at most `page_size`. Never return an empty chunk. 
""" page = [] it = iter(seq) while 1: try: for i in range(page_size): page.append(next(it)) yield page page = [] except StopIteration: if page: yield page return def _split_sql(sql): """Split *sql* on a single ``%s`` placeholder. Split on the %s, perform %% replacement and return pre, post lists of snippets. """ curr = pre = [] post = [] tokens = re.split(br"(%.)", sql) for token in tokens: if len(token) != 2 or token[:1] != b"%": curr.append(token) continue if token[1:] == b"s": if curr is pre: curr = post else: raise ValueError("the query contains more than one '%s' placeholder") elif token[1:] == b"%": curr.append(b"%") else: raise ValueError( "unsupported format character: '%s'" % token[1:].decode("ascii", "replace") ) if curr is pre: raise ValueError("the query doesn't contain any '%s' placeholder") return pre, post def execute_values_generator(cur, sql, argslist, template=None, page_size=100): """Execute a statement using SQL ``VALUES`` with a sequence of parameters. Rows returned by the query are returned through a generator. You need to consume the generator for the queries to be executed! :param cur: the cursor to use to execute the query. :param sql: the query to execute. It must contain a single ``%s`` placeholder, which will be replaced by a `VALUES list`__. Example: ``"INSERT INTO mytable (id, f1, f2) VALUES %s"``. :param argslist: sequence of sequences or dictionaries with the arguments to send to the query. The type and content must be consistent with *template*. :param template: the snippet to merge to every item in *argslist* to compose the query. - If the *argslist* items are sequences it should contain positional placeholders (e.g. ``"(%s, %s, %s)"``, or ``"(%s, %s, 42)``" if there are constants value...). - If the *argslist* items are mappings it should contain named placeholders (e.g. ``"(%(id)s, %(f1)s, 42)"``). If not specified, assume the arguments are sequence and use a simple positional template (i.e. 
``(%s, %s, ...)``), with the number of placeholders sniffed by the first element in *argslist*. :param page_size: maximum number of *argslist* items to include in every statement. If there are more items the function will execute more than one statement. :param yield_from_cur: Whether to yield results from the cursor in this function directly. .. __: https://www.postgresql.org/docs/current/static/queries-values.html After the execution of the function the `cursor.rowcount` property will **not** contain a total result. """ # we can't just use sql % vals because vals is bytes: if sql is bytes # there will be some decoding error because of stupid codec used, and Py3 # doesn't implement % on bytes. if not isinstance(sql, bytes): - sql = sql.encode(psycopg2.extensions.encodings[cur.connection.encoding]) + sql = sql.encode(pgencodings[cur.connection.encoding]) pre, post = _split_sql(sql) for page in _paginate(argslist, page_size=page_size): if template is None: template = b"(" + b",".join([b"%s"] * len(page[0])) + b")" parts = pre[:] for args in page: parts.append(cur.mogrify(template, args)) parts.append(b",") parts[-1:] = post cur.execute(b"".join(parts)) yield from cur + + +def import_swhmodule(modname): + if not modname.startswith("swh."): + modname = f"swh.{modname}" + try: + m = import_module(modname) + except ImportError as exc: + logger.error(f"Could not load the {modname} module: {exc}") + return None + return m + + +def get_sql_for_package(modname: str, upgrade: bool = False) -> List[pathlib.Path]: + """Return the (sorted) list of sql script files for the given swh module + + If upgrade is True, return the list of available migration scripts, + otherwise, return the list of initialization scripts. 
+ """ + m = import_swhmodule(modname) + if m is None: + raise ValueError(f"Module {modname} cannot be loaded") + + sqldir = pathlib.Path(m.__file__).parent / "sql" + if upgrade: + sqldir /= "upgrades" + if not sqldir.is_dir(): + raise ValueError( + "Module {} does not provide a db schema (no sql/ dir)".format(modname) + ) + return sorted(sqldir.glob("*.sql"), key=lambda x: sortkey(x.name)) + + +def populate_database_for_package( + modname: str, conninfo: str, flavor: Optional[str] = None +) -> Tuple[bool, Optional[int], Optional[str]]: + """Populate the database, pointed at with ``conninfo``, + using the SQL files found in the package ``modname``. + Also fill the 'dbmodule' table with the given ``modname``. + + Args: + modname: Name of the module of which we're loading the files + conninfo: connection info string for the SQL database + flavor: the module-specific flavor which we want to initialize the database under + + Returns: + Tuple with three elements: whether the database has been initialized; the current + version of the database; if it exists, the flavor of the database. 
+ """ + current_version = swh_db_version(conninfo) + if current_version is not None: + dbflavor = swh_db_flavor(conninfo) + return False, current_version, dbflavor + + def globalsortkey(key): + "like sortkey but only on basenames" + return sortkey(path.basename(key)) + + sqlfiles = get_sql_for_package(modname) + get_sql_for_package("swh.core.db") + sqlfiles = sorted(sqlfiles, key=lambda x: sortkey(x.stem)) + sqlfiles = [fpath for fpath in sqlfiles if "-superuser-" not in fpath.stem] + execute_sqlfiles(sqlfiles, conninfo, flavor) + + # populate the dbmodule table + swh_set_db_module(conninfo, modname) + + current_db_version = swh_db_version(conninfo) + dbflavor = swh_db_flavor(conninfo) + return True, current_db_version, dbflavor + + +def get_database_info( + conninfo: str, +) -> Tuple[Optional[str], Optional[int], Optional[str]]: + """Get version, flavor and module of the db""" + dbmodule = swh_db_module(conninfo) + dbversion = swh_db_version(conninfo) + dbflavor = None + if dbversion is not None: + dbflavor = swh_db_flavor(conninfo) + return (dbmodule, dbversion, dbflavor) + + +def parse_dsn_or_dbname(dsn_or_dbname: str) -> Dict[str, str]: + """Parse a psycopg2 dsn, falling back to supporting plain database names as well""" + try: + return _parse_dsn(dsn_or_dbname) + except psycopg2.ProgrammingError: + # psycopg2 failed to parse the DSN; it's probably a database name, + # handle it as such + return _parse_dsn(f"dbname={dsn_or_dbname}") + + +def init_admin_extensions(modname: str, conninfo: str) -> None: + """The remaining initialization process -- running -superuser- SQL files -- is done + using the given conninfo, thus connecting to the newly created database + + """ + sqlfiles = get_sql_for_package(modname) + sqlfiles = [fname for fname in sqlfiles if "-superuser-" in fname.stem] + execute_sqlfiles(sqlfiles, conninfo) + + +def create_database_for_package( + modname: str, conninfo: str, template: str = "template1" +): + """Create the database pointed at with 
``conninfo``, and initialize it using + -superuser- SQL files found in the package ``modname``. + + Args: + modname: Name of the module of which we're loading the files + conninfo: connection info string or plain database name for the SQL database + template: the name of the database to connect to and use as template to create + the new database + + """ + # Use the given conninfo string, but with dbname replaced by the template dbname + # for the database creation step + creation_dsn = parse_dsn_or_dbname(conninfo) + dbname = creation_dsn["dbname"] + creation_dsn["dbname"] = template + logger.debug("db_create dbname=%s (from %s)", dbname, template) + subprocess.check_call( + [ + "psql", + "--quiet", + "--no-psqlrc", + "-v", + "ON_ERROR_STOP=1", + "-d", + make_dsn(**creation_dsn), + "-c", + f'CREATE DATABASE "{dbname}"', + ] + ) + init_admin_extensions(modname, conninfo) + + +def execute_sqlfiles( + sqlfiles: Collection[pathlib.Path], conninfo: str, flavor: Optional[str] = None +): + """Execute a list of SQL files on the database pointed at with ``conninfo``. 
+ + Args: + sqlfiles: List of SQL files to execute + conninfo: connection info string for the SQL database + flavor: the database flavor to initialize + """ + psql_command = [ + "psql", + "--quiet", + "--no-psqlrc", + "-v", + "ON_ERROR_STOP=1", + "-d", + conninfo, + ] + + flavor_set = False + for sqlfile in sqlfiles: + logger.debug(f"execute SQL file {sqlfile} dbname={conninfo}") + subprocess.check_call(psql_command + ["-f", str(sqlfile)]) + + if ( + flavor is not None + and not flavor_set + and sqlfile.name.endswith("-flavor.sql") + ): + logger.debug("Setting database flavor %s", flavor) + query = f"insert into dbflavor (flavor) values ('{flavor}')" + subprocess.check_call(psql_command + ["-c", query]) + flavor_set = True + + if flavor is not None and not flavor_set: + logger.warn( + "Asked for flavor %s, but module does not support database flavors", flavor, + ) diff --git a/swh/core/db/pytest_plugin.py b/swh/core/db/pytest_plugin.py index 0792d0b..9e5b2cb 100644 --- a/swh/core/db/pytest_plugin.py +++ b/swh/core/db/pytest_plugin.py @@ -1,190 +1,282 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import glob from importlib import import_module import logging import subprocess -from typing import List, Optional, Set, Union +from typing import Callable, Iterable, Iterator, List, Optional, Sequence, Set, Union from _pytest.fixtures import FixtureRequest import psycopg2 import pytest +from pytest_postgresql.compat import check_for_psycopg2, connection +from pytest_postgresql.executor import PostgreSQLExecutor +from pytest_postgresql.executor_noop import NoopExecutor from pytest_postgresql.janitor import DatabaseJanitor -from swh.core.utils import numfile_sortkey as sortkey +from swh.core.db.db_utils import ( + init_admin_extensions, + populate_database_for_package, + 
swh_set_db_version, +) +from swh.core.utils import basename_sortkey # to keep mypy happy regardless pytest-postgresql version try: _pytest_pgsql_get_config_module = import_module("pytest_postgresql.config") except ImportError: # pytest_postgresql < 3.0.0 _pytest_pgsql_get_config_module = import_module("pytest_postgresql.factories") _pytest_postgresql_get_config = getattr(_pytest_pgsql_get_config_module, "get_config") logger = logging.getLogger(__name__) class SWHDatabaseJanitor(DatabaseJanitor): """SWH database janitor implementation with a a different setup/teardown policy than than the stock one. Instead of dropping, creating and initializing the database for each test, it creates and initializes the db once, then truncates the tables (and sequences) in between tests. This is needed to have acceptable test performances. """ def __init__( self, user: str, host: str, - port: str, + port: int, dbname: str, version: Union[str, float], - dump_files: Union[None, str, List[str]] = None, + password: Optional[str] = None, + isolation_level: Optional[int] = None, + connection_timeout: int = 60, + dump_files: Optional[Union[str, Sequence[str]]] = None, no_truncate_tables: Set[str] = set(), + no_db_drop: bool = False, ) -> None: super().__init__(user, host, port, dbname, version) - if not hasattr(self, "dbname") and hasattr(self, "db_name"): - # pytest_postgresql < 3.0.0 - self.dbname = getattr(self, "db_name") - if dump_files is None: - self.dump_files = [] - elif isinstance(dump_files, str): - self.dump_files = sorted(glob.glob(dump_files), key=sortkey) - else: - self.dump_files = dump_files # do no truncate the following tables self.no_truncate_tables = set(no_truncate_tables) + self.no_db_drop = no_db_drop + self.dump_files = dump_files - def db_setup(self): + def psql_exec(self, fname: str) -> None: conninfo = ( f"host={self.host} user={self.user} port={self.port} dbname={self.dbname}" ) - for fname in self.dump_files: - subprocess.check_call( - [ - "psql", - "--quiet", 
- "--no-psqlrc", - "-v", - "ON_ERROR_STOP=1", - "-d", - conninfo, - "-f", - fname, - ] - ) + subprocess.check_call( + [ + "psql", + "--quiet", + "--no-psqlrc", + "-v", + "ON_ERROR_STOP=1", + "-d", + conninfo, + "-f", + fname, + ] + ) - def db_reset(self): + def db_reset(self) -> None: """Truncate tables (all but self.no_truncate_tables set) and sequences """ with psycopg2.connect( dbname=self.dbname, user=self.user, host=self.host, port=self.port, ) as cnx: with cnx.cursor() as cur: cur.execute( "SELECT table_name FROM information_schema.tables " "WHERE table_schema = %s", ("public",), ) all_tables = set(table for (table,) in cur.fetchall()) tables_to_truncate = all_tables - self.no_truncate_tables for table in tables_to_truncate: cur.execute("TRUNCATE TABLE %s CASCADE" % table) cur.execute( "SELECT sequence_name FROM information_schema.sequences " "WHERE sequence_schema = %s", ("public",), ) seqs = set(seq for (seq,) in cur.fetchall()) for seq in seqs: cur.execute("ALTER SEQUENCE %s RESTART;" % seq) cnx.commit() - def init(self): - """Initialize db. Create the db if it does not exist. 
Reset it if it exists.""" - with self.cursor() as cur: - cur.execute( - "SELECT COUNT(1) FROM pg_database WHERE datname=%s;", (self.dbname,) - ) - db_exists = cur.fetchone()[0] == 1 - if db_exists: - cur.execute( - "UPDATE pg_database SET datallowconn=true WHERE datname = %s;", - (self.dbname,), - ) - self.db_reset() - return + def _db_exists(self, cur, dbname): + cur.execute( + "SELECT EXISTS " + "(SELECT datname FROM pg_catalog.pg_database WHERE datname= %s);", + (dbname,), + ) + row = cur.fetchone() + return (row is not None) and row[0] - # initialize the inexistent db + def init(self) -> None: + """Create database in postgresql out of a template it if it exists, bare + creation otherwise.""" + template_name = f"{self.dbname}_tmpl" + logger.debug("Initialize DB %s", self.dbname) with self.cursor() as cur: - cur.execute('CREATE DATABASE "{}";'.format(self.dbname)) - self.db_setup() - - def drop(self): - """The original DatabaseJanitor implementation prevents new connections from happening, - destroys current opened connections and finally drops the database. - - We actually do not want to drop the db so we instead do nothing and resets - (truncate most tables and sequences) the db instead, in order to have some - acceptable performance. + tmpl_exists = self._db_exists(cur, template_name) + db_exists = self._db_exists(cur, self.dbname) + if not db_exists: + if tmpl_exists: + logger.debug( + "Create %s from template %s", self.dbname, template_name + ) + cur.execute( + f'CREATE DATABASE "{self.dbname}" TEMPLATE "{template_name}";' + ) + else: + logger.debug("Create %s from scratch", self.dbname) + cur.execute(f'CREATE DATABASE "{self.dbname}";') + if self.dump_files: + logger.warning( + "Using dump_files on the postgresql_fact fixture " + "is deprecated. See swh.core documentation for more " + "details." 
+ ) + for dump_file in gen_dump_files(self.dump_files): + logger.info(f"Loading {dump_file}") + self.psql_exec(dump_file) + else: + logger.debug("Reset %s", self.dbname) + self.db_reset() - """ - pass + def drop(self) -> None: + """Drop database in postgresql.""" + if self.no_db_drop: + with self.cursor() as cur: + self._terminate_connection(cur, self.dbname) + else: + super().drop() # the postgres_fact factory fixture below is mostly a copy of the code # from pytest-postgresql. We need a custom version here to be able to # specify our version of the DBJanitor we use. def postgresql_fact( process_fixture_name: str, dbname: Optional[str] = None, - dump_files: Union[str, List[str]] = "", + load: Optional[Sequence[Union[Callable, str]]] = None, + isolation_level: Optional[int] = None, + modname: Optional[str] = None, + dump_files: Optional[Union[str, List[str]]] = None, no_truncate_tables: Set[str] = {"dbversion"}, -): + no_db_drop: bool = False, +) -> Callable[[FixtureRequest], Iterator[connection]]: + """ + Return connection fixture factory for PostgreSQL. + + :param process_fixture_name: name of the process fixture + :param dbname: database name + :param load: SQL, function or function import paths to automatically load + into our test database + :param isolation_level: optional postgresql isolation level + defaults to server's default + :param modname: (swh) module name for which the database is created + :dump_files: (deprecated, use load instead) list of sql script files to + execute after the database has been created + :no_truncate_tables: list of table not to truncate between tests (only used + when no_db_drop is True) + :no_db_drop: if True, keep the database between tests; in which case, the + database is reset (see SWHDatabaseJanitor.db_reset()) by truncating + most of the tables. Note that this makes de facto tests (potentially) + interdependent, use with extra caution. 
+ :returns: function which makes a connection to postgresql + """ + @pytest.fixture - def postgresql_factory(request: FixtureRequest): - """Fixture factory for PostgreSQL. + def postgresql_factory(request: FixtureRequest) -> Iterator[connection]: + """ + Fixture factory for PostgreSQL. - :param FixtureRequest request: fixture request object - :rtype: psycopg2.connection + :param request: fixture request object :returns: postgresql client """ - config = _pytest_postgresql_get_config(request) - proc_fixture = request.getfixturevalue(process_fixture_name) + check_for_psycopg2() + proc_fixture: Union[PostgreSQLExecutor, NoopExecutor] = request.getfixturevalue( + process_fixture_name + ) pg_host = proc_fixture.host pg_port = proc_fixture.port pg_user = proc_fixture.user + pg_password = proc_fixture.password pg_options = proc_fixture.options - pg_db = dbname or config["dbname"] + pg_db = dbname or proc_fixture.dbname + pg_load = load or [] + assert pg_db is not None + with SWHDatabaseJanitor( pg_user, pg_host, pg_port, pg_db, proc_fixture.version, + pg_password, + isolation_level=isolation_level, dump_files=dump_files, no_truncate_tables=no_truncate_tables, - ): - connection = psycopg2.connect( + no_db_drop=no_db_drop, + ) as janitor: + db_connection: connection = psycopg2.connect( dbname=pg_db, user=pg_user, + password=pg_password, host=pg_host, port=pg_port, options=pg_options, ) - yield connection - connection.close() + for load_element in pg_load: + janitor.load(load_element) + try: + yield db_connection + finally: + db_connection.close() return postgresql_factory + + +def initialize_database_for_module(modname, version, **kwargs): + conninfo = psycopg2.connect(**kwargs).dsn + init_admin_extensions(modname, conninfo) + populate_database_for_package(modname, conninfo) + try: + swh_set_db_version(conninfo, version) + except psycopg2.errors.UniqueViolation: + logger.warn( + "Version already set by db init scripts. 
" + "This generally means the swh.{modname} package needs to be " + "updated for swh.core>=1.2" + ) + + +def gen_dump_files(dump_files: Union[str, Iterable[str]]) -> Iterator[str]: + """Generate files potentially resolving glob patterns if any + + """ + if isinstance(dump_files, str): + dump_files = [dump_files] + for dump_file in dump_files: + if glob.has_magic(dump_file): + # if the dump_file is a glob pattern one, resolve it + yield from ( + fname for fname in sorted(glob.glob(dump_file), key=basename_sortkey) + ) + else: + # otherwise, just return the filename + yield dump_file diff --git a/swh/core/db/sql/35-dbversion.sql b/swh/core/db/sql/35-dbversion.sql new file mode 100644 index 0000000..ee85ac7 --- /dev/null +++ b/swh/core/db/sql/35-dbversion.sql @@ -0,0 +1,18 @@ +-- common metadata/context structures +-- +-- we use a 35- prefix for this to make it executed after db schema initialisation +-- sql scripts, which are normally 30- prefixed, so that it remains compatible +-- with packages that have not yet migrated to swh.core 1.2 + +-- schema versions +create table if not exists dbversion +( + version int primary key, + release timestamptz, + description text +); + +comment on table dbversion is 'Details of current db version'; +comment on column dbversion.version is 'SQL schema version'; +comment on column dbversion.release is 'Version deployment timestamp'; +comment on column dbversion.description is 'Release description'; diff --git a/swh/core/db/sql/36-dbmodule.sql b/swh/core/db/sql/36-dbmodule.sql new file mode 100644 index 0000000..ae9a670 --- /dev/null +++ b/swh/core/db/sql/36-dbmodule.sql @@ -0,0 +1,15 @@ +-- common metadata/context structures +-- +-- we use a 3x- prefix for this to make it executed after db schema initialisation +-- sql scripts, which are normally 30- prefixed, so that it remains compatible +-- with packages that have not yet migrated to swh.core 1.2 + +-- swh module this db is storing data for +create table if not exists dbmodule ( 
+ dbmodule text, + single_row char(1) primary key default 'x', + check (single_row = 'x') +); +comment on table dbmodule is 'Database module storage'; +comment on column dbmodule.dbmodule is 'Database (swh) module currently deployed'; +comment on column dbmodule.single_row is 'Bogus column to force the table to have a single row'; diff --git a/swh/core/db/tests/conftest.py b/swh/core/db/tests/conftest.py index 7be81e3..b1d42f4 100644 --- a/swh/core/db/tests/conftest.py +++ b/swh/core/db/tests/conftest.py @@ -1,12 +1,67 @@ +# Copyright (C) 2019-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + import os +from click.testing import CliRunner from hypothesis import HealthCheck +import pytest + +from swh.core.db.db_utils import import_swhmodule os.environ["LC_ALL"] = "C.UTF-8" # we use getattr here to keep mypy happy regardless hypothesis version function_scoped_fixture_check = ( [getattr(HealthCheck, "function_scoped_fixture")] if hasattr(HealthCheck, "function_scoped_fixture") else [] ) + + +@pytest.fixture +def cli_runner(): + return CliRunner() + + +@pytest.fixture() +def mock_import_swhmodule(mocker, datadir): + """This bypasses the module manipulation to make import_swhmodule return a mock + object suitable for data test files listing via get_sql_for_package. + + For a given module `test.`, return a MagicMock object with a __name__ + set to `` and __file__ pointing to `data//__init__.py`. + + The Mock object also defines a `get_datastore()` attribute on which the + `get_current_version()` exists and will return 42. 
+ + Typical usage:: + + def test_xxx(cli_runner, mock_import_swhmodule): + conninfo = craft_conninfo(test_db, "new-db") + module_name = "test.cli" + # the command below will use sql scripts from + # swh/core/db/tests/data/cli/sql/*.sql + cli_runner.invoke(swhdb, ["init", module_name, "--dbname", conninfo]) + + """ + mock = mocker.MagicMock + + def import_swhmodule_mock(modname): + if modname.startswith("test."): + dirname = modname.split(".", 1)[1] + + def get_datastore(*args, **kw): + return mock(get_current_version=lambda: 42) + + return mock( + __name__=modname, + __file__=os.path.join(datadir, dirname, "__init__.py"), + get_datastore=get_datastore, + ) + else: + return import_swhmodule(modname) + + return mocker.patch("swh.core.db.db_utils.import_swhmodule", import_swhmodule_mock) diff --git a/swh/core/db/tests/data/cli/0-superuser-init.sql b/swh/core/db/tests/data/cli/sql/0-superuser-init.sql similarity index 100% copy from swh/core/db/tests/data/cli/0-superuser-init.sql copy to swh/core/db/tests/data/cli/sql/0-superuser-init.sql diff --git a/swh/core/db/tests/data/cli/1-schema.sql b/swh/core/db/tests/data/cli/sql/30-schema.sql similarity index 87% rename from swh/core/db/tests/data/cli/1-schema.sql rename to swh/core/db/tests/data/cli/sql/30-schema.sql index a5f6d2c..fcbd581 100644 --- a/swh/core/db/tests/data/cli/1-schema.sql +++ b/swh/core/db/tests/data/cli/sql/30-schema.sql @@ -1,13 +1,13 @@ -- schema version table which won't get truncated -create table if not exists dbversion ( +create table dbversion ( version int primary key, release timestamptz, description text ); -- origin table create table if not exists origin ( id bigserial not null, url text not null, hash text not null ); diff --git a/swh/core/db/tests/data/cli/3-func.sql b/swh/core/db/tests/data/cli/sql/40-funcs.sql similarity index 100% copy from swh/core/db/tests/data/cli/3-func.sql copy to swh/core/db/tests/data/cli/sql/40-funcs.sql diff --git a/swh/core/db/tests/data/cli/4-data.sql 
b/swh/core/db/tests/data/cli/sql/50-data.sql similarity index 73% copy from swh/core/db/tests/data/cli/4-data.sql copy to swh/core/db/tests/data/cli/sql/50-data.sql index ed29fa1..f6564e0 100644 --- a/swh/core/db/tests/data/cli/4-data.sql +++ b/swh/core/db/tests/data/cli/sql/50-data.sql @@ -1,5 +1,5 @@ insert into dbversion(version, release, description) -values (1, '2016-02-22 15:56:28.358587+00', 'Work In Progress'); +values (10, '2016-02-22 15:56:28.358587+00', 'Work In Progress'); insert into origin(url, hash) values ('https://forge.softwareheritage.org', hash_sha1('https://forge.softwareheritage.org')); diff --git a/swh/core/db/tests/data/cli/0-superuser-init.sql b/swh/core/db/tests/data/cli_new/sql/0-superuser-init.sql similarity index 100% rename from swh/core/db/tests/data/cli/0-superuser-init.sql rename to swh/core/db/tests/data/cli_new/sql/0-superuser-init.sql diff --git a/swh/core/db/tests/data/cli_new/sql/30-schema.sql b/swh/core/db/tests/data/cli_new/sql/30-schema.sql new file mode 100644 index 0000000..64289f7 --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/30-schema.sql @@ -0,0 +1,6 @@ +-- origin table +create table if not exists origin ( + id bigserial not null, + url text not null, + hash text not null +); diff --git a/swh/core/db/tests/data/cli/3-func.sql b/swh/core/db/tests/data/cli_new/sql/40-funcs.sql similarity index 100% rename from swh/core/db/tests/data/cli/3-func.sql rename to swh/core/db/tests/data/cli_new/sql/40-funcs.sql diff --git a/swh/core/db/tests/data/cli/4-data.sql b/swh/core/db/tests/data/cli_new/sql/50-data.sql similarity index 51% rename from swh/core/db/tests/data/cli/4-data.sql rename to swh/core/db/tests/data/cli_new/sql/50-data.sql index ed29fa1..a0120a6 100644 --- a/swh/core/db/tests/data/cli/4-data.sql +++ b/swh/core/db/tests/data/cli_new/sql/50-data.sql @@ -1,5 +1,2 @@ -insert into dbversion(version, release, description) -values (1, '2016-02-22 15:56:28.358587+00', 'Work In Progress'); - insert into origin(url, 
hash) values ('https://forge.softwareheritage.org', hash_sha1('https://forge.softwareheritage.org')); diff --git a/swh/core/db/tests/data/cli_new/sql/upgrades/001.sql b/swh/core/db/tests/data/cli_new/sql/upgrades/001.sql new file mode 100644 index 0000000..d914414 --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/upgrades/001.sql @@ -0,0 +1,5 @@ +-- this script should never be executed by an upgrade procedure (because +-- version 1 is set by 'swh db init') + +insert into origin(url, hash) +values ('this should never be executed', hash_sha1('')); diff --git a/swh/core/db/tests/data/cli_new/sql/upgrades/002.sql b/swh/core/db/tests/data/cli_new/sql/upgrades/002.sql new file mode 100644 index 0000000..5f12b9e --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/upgrades/002.sql @@ -0,0 +1,4 @@ +-- + +insert into origin(url, hash) +values ('version002', hash_sha1('version002')); diff --git a/swh/core/db/tests/data/cli_new/sql/upgrades/003.sql b/swh/core/db/tests/data/cli_new/sql/upgrades/003.sql new file mode 100644 index 0000000..87ac9e1 --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/upgrades/003.sql @@ -0,0 +1,4 @@ +-- + +insert into origin(url, hash) +values ('version003', hash_sha1('version003')); diff --git a/swh/core/db/tests/data/cli_new/sql/upgrades/004.sql b/swh/core/db/tests/data/cli_new/sql/upgrades/004.sql new file mode 100644 index 0000000..d1f03da --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/upgrades/004.sql @@ -0,0 +1,4 @@ +-- + +insert into origin(url, hash) +values ('version004', hash_sha1('version004')); diff --git a/swh/core/db/tests/data/cli_new/sql/upgrades/005.sql b/swh/core/db/tests/data/cli_new/sql/upgrades/005.sql new file mode 100644 index 0000000..8d0db9e --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/upgrades/005.sql @@ -0,0 +1,4 @@ +-- + +insert into origin(url, hash) +values ('version005', hash_sha1('version005')); diff --git a/swh/core/db/tests/data/cli_new/sql/upgrades/006.sql 
b/swh/core/db/tests/data/cli_new/sql/upgrades/006.sql new file mode 100644 index 0000000..115b59f --- /dev/null +++ b/swh/core/db/tests/data/cli_new/sql/upgrades/006.sql @@ -0,0 +1,7 @@ +-- + +insert into origin(url, hash) +values ('version006', hash_sha1('version006')); + +insert into dbversion(version, release, description) +values (6, 'NOW()', 'Updated version from upgrade script'); diff --git a/swh/core/db/tests/pytest_plugin/test_pytest_plugin.py b/swh/core/db/tests/pytest_plugin/test_pytest_plugin.py index 84b0039..67a3fb5 100644 --- a/swh/core/db/tests/pytest_plugin/test_pytest_plugin.py +++ b/swh/core/db/tests/pytest_plugin/test_pytest_plugin.py @@ -1,173 +1,185 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import glob import os +from pytest_postgresql import factories + from swh.core.db import BaseDb -from swh.core.db.pytest_plugin import postgresql_fact +from swh.core.db.pytest_plugin import gen_dump_files, postgresql_fact SQL_DIR = os.path.join(os.path.dirname(__file__), "data") +test_postgresql_proc = factories.postgresql_proc( + dbname="fun", + load=sorted(glob.glob(f"{SQL_DIR}/*.sql")), # type: ignore[arg-type] + # type ignored because load is typed as Optional[List[...]] instead of an + # Optional[Sequence[...]] in pytest_postgresql<4 +) # db with special policy for tables dbversion and people postgres_fun = postgresql_fact( - "postgresql_proc", - dbname="fun", - dump_files=f"{SQL_DIR}/*.sql", - no_truncate_tables={"dbversion", "people"}, + "test_postgresql_proc", no_db_drop=True, no_truncate_tables={"dbversion", "people"}, ) postgres_fun2 = postgresql_fact( - "postgresql_proc", + "test_postgresql_proc", dbname="fun2", - dump_files=sorted(glob.glob(f"{SQL_DIR}/*.sql")), + load=sorted(glob.glob(f"{SQL_DIR}/*.sql")), no_truncate_tables={"dbversion", 
"people"}, + no_db_drop=True, ) def test_smoke_test_fun_db_is_up(postgres_fun): """This ensures the db is created and configured according to its dumps files. """ with BaseDb.connect(postgres_fun.dsn).cursor() as cur: cur.execute("select count(*) from dbversion") nb_rows = cur.fetchone()[0] assert nb_rows == 5 cur.execute("select count(*) from fun") nb_rows = cur.fetchone()[0] assert nb_rows == 3 cur.execute("select count(*) from people") nb_rows = cur.fetchone()[0] assert nb_rows == 2 # in data, we requested a value already so it starts at 2 cur.execute("select nextval('serial')") val = cur.fetchone()[0] assert val == 2 def test_smoke_test_fun2_db_is_up(postgres_fun2): """This ensures the db is created and configured according to its dumps files. """ with BaseDb.connect(postgres_fun2.dsn).cursor() as cur: cur.execute("select count(*) from dbversion") nb_rows = cur.fetchone()[0] assert nb_rows == 5 cur.execute("select count(*) from fun") nb_rows = cur.fetchone()[0] assert nb_rows == 3 cur.execute("select count(*) from people") nb_rows = cur.fetchone()[0] assert nb_rows == 2 # in data, we requested a value already so it starts at 2 cur.execute("select nextval('serial')") val = cur.fetchone()[0] assert val == 2 def test_smoke_test_fun_db_is_still_up_and_got_reset(postgres_fun): """This ensures that within another tests, the 'fun' db is still up, created (and not configured again). 
This time, most of the data has been reset: - except for tables 'dbversion' and 'people' which were left as is - the other tables from the schema (here only "fun") got truncated - the sequences got truncated as well """ with BaseDb.connect(postgres_fun.dsn).cursor() as cur: # db version is excluded from the truncate cur.execute("select count(*) from dbversion") nb_rows = cur.fetchone()[0] assert nb_rows == 5 # people is also allowed not to be truncated cur.execute("select count(*) from people") nb_rows = cur.fetchone()[0] assert nb_rows == 2 # table and sequence are reset cur.execute("select count(*) from fun") nb_rows = cur.fetchone()[0] assert nb_rows == 0 cur.execute("select nextval('serial')") val = cur.fetchone()[0] assert val == 1 # db with no special policy for tables truncation, all tables are reset postgres_people = postgresql_fact( "postgresql_proc", dbname="people", dump_files=f"{SQL_DIR}/*.sql", no_truncate_tables=set(), + no_db_drop=True, ) +def test_gen_dump_files(): + files = [os.path.basename(fn) for fn in gen_dump_files(f"{SQL_DIR}/*.sql")] + assert files == ["0-schema.sql", "1-data.sql"] + + def test_smoke_test_people_db_up(postgres_people): """'people' db is up and configured """ with BaseDb.connect(postgres_people.dsn).cursor() as cur: cur.execute("select count(*) from dbversion") nb_rows = cur.fetchone()[0] assert nb_rows == 5 cur.execute("select count(*) from people") nb_rows = cur.fetchone()[0] assert nb_rows == 2 cur.execute("select count(*) from fun") nb_rows = cur.fetchone()[0] assert nb_rows == 3 cur.execute("select nextval('serial')") val = cur.fetchone()[0] assert val == 2 def test_smoke_test_people_db_up_and_reset(postgres_people): """'people' db is up and got reset on every tables and sequences """ with BaseDb.connect(postgres_people.dsn).cursor() as cur: # tables are truncated after the first round cur.execute("select count(*) from dbversion") nb_rows = cur.fetchone()[0] assert nb_rows == 0 # tables are truncated after the first 
round cur.execute("select count(*) from people") nb_rows = cur.fetchone()[0] assert nb_rows == 0 # table and sequence are reset cur.execute("select count(*) from fun") nb_rows = cur.fetchone()[0] assert nb_rows == 0 cur.execute("select nextval('serial')") val = cur.fetchone()[0] assert val == 1 # db with no initialization step, an empty db postgres_no_init = postgresql_fact("postgresql_proc", dbname="something") def test_smoke_test_db_no_init(postgres_no_init): """We can connect to the db nonetheless """ with BaseDb.connect(postgres_no_init.dsn).cursor() as cur: cur.execute("select now()") data = cur.fetchone()[0] assert data is not None diff --git a/swh/core/db/tests/test_cli.py b/swh/core/db/tests/test_cli.py index b3fc568..3164ffa 100644 --- a/swh/core/db/tests/test_cli.py +++ b/swh/core/db/tests/test_cli.py @@ -1,240 +1,350 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy -import glob -from os import path +import os +import traceback -from click.testing import CliRunner import pytest +import yaml from swh.core.cli.db import db as swhdb from swh.core.db import BaseDb -from swh.core.db.pytest_plugin import postgresql_fact +from swh.core.db.db_utils import import_swhmodule, swh_db_module, swh_db_version from swh.core.tests.test_cli import assert_section_contains -@pytest.fixture -def cli_runner(): - return CliRunner() - - def test_cli_swh_help(swhmain, cli_runner): swhmain.add_command(swhdb) result = cli_runner.invoke(swhmain, ["-h"]) assert result.exit_code == 0 assert_section_contains( result.output, "Commands", "db Software Heritage database generic tools." 
) help_db_snippets = ( ( "Usage", ( "Usage: swh db [OPTIONS] COMMAND [ARGS]...", "Software Heritage database generic tools.", ), ), ( "Commands", ( "create Create a database for the Software Heritage .", "init Initialize a database for the Software Heritage .", "init-admin Execute superuser-level initialization steps", ), ), ) def test_cli_swh_db_help(swhmain, cli_runner): swhmain.add_command(swhdb) result = cli_runner.invoke(swhmain, ["db", "-h"]) assert result.exit_code == 0 for section, snippets in help_db_snippets: for snippet in snippets: assert_section_contains(result.output, section, snippet) -@pytest.fixture() -def mock_package_sql(mocker, datadir): - """This bypasses the module manipulation to only returns the data test files. - - """ - from swh.core.utils import numfile_sortkey as sortkey - - mock_sql_files = mocker.patch("swh.core.cli.db.get_sql_for_package") - sql_files = sorted(glob.glob(path.join(datadir, "cli", "*.sql")), key=sortkey) - mock_sql_files.return_value = sql_files - return mock_sql_files - - -# We do not want the truncate behavior for those tests -test_db = postgresql_fact( - "postgresql_proc", dbname="clidb", no_truncate_tables={"dbversion", "origin"} -) - - @pytest.fixture -def swh_db_cli(cli_runner, monkeypatch, test_db): +def swh_db_cli(cli_runner, monkeypatch, postgresql): """This initializes a cli_runner and sets the correct environment variable expected by the cli to run appropriately (when not specifying the --dbname flag) """ - db_params = test_db.get_dsn_parameters() + db_params = postgresql.get_dsn_parameters() monkeypatch.setenv("PGHOST", db_params["host"]) monkeypatch.setenv("PGUSER", db_params["user"]) monkeypatch.setenv("PGPORT", db_params["port"]) return cli_runner, db_params def craft_conninfo(test_db, dbname=None) -> str: """Craft conninfo string out of the test_db object. 
This also allows to override the dbname.""" db_params = test_db.get_dsn_parameters() if dbname: params = copy.deepcopy(db_params) params["dbname"] = dbname else: params = db_params return "postgresql://{user}@{host}:{port}/{dbname}".format(**params) -def test_cli_swh_db_create_and_init_db(cli_runner, test_db, mock_package_sql): +def test_cli_swh_db_create_and_init_db(cli_runner, postgresql, mock_import_swhmodule): """Create a db then initializing it should be ok """ - module_name = "something" + module_name = "test.cli" - conninfo = craft_conninfo(test_db, "new-db") + conninfo = craft_conninfo(postgresql, "new-db") # This creates the db and installs the necessary admin extensions result = cli_runner.invoke(swhdb, ["create", module_name, "--dbname", conninfo]) assert result.exit_code == 0, f"Unexpected output: {result.output}" # This initializes the schema and data result = cli_runner.invoke(swhdb, ["init", module_name, "--dbname", conninfo]) assert result.exit_code == 0, f"Unexpected output: {result.output}" # the origin value in the scripts uses a hash function (which implementation wise # uses a function from the pgcrypt extension, installed during db creation step) with BaseDb.connect(conninfo).cursor() as cur: cur.execute("select * from origin") origins = cur.fetchall() assert len(origins) == 1 def test_cli_swh_db_initialization_fail_without_creation_first( - cli_runner, test_db, mock_package_sql + cli_runner, postgresql, mock_import_swhmodule ): """Init command on an inexisting db cannot work """ - module_name = "anything" # it's mocked here - conninfo = craft_conninfo(test_db, "inexisting-db") + module_name = "test.cli" # it's mocked here + conninfo = craft_conninfo(postgresql, "inexisting-db") result = cli_runner.invoke(swhdb, ["init", module_name, "--dbname", conninfo]) # Fails because we cannot connect to an inexisting db assert result.exit_code == 1, f"Unexpected output: {result.output}" def test_cli_swh_db_initialization_fail_without_extension( - 
cli_runner, test_db, mock_package_sql + cli_runner, postgresql, mock_import_swhmodule ): """Init command cannot work without privileged extension. In this test, the schema needs privileged extension to work. """ - module_name = "anything" # it's mocked here - conninfo = craft_conninfo(test_db) + module_name = "test.cli" # it's mocked here + conninfo = craft_conninfo(postgresql) result = cli_runner.invoke(swhdb, ["init", module_name, "--dbname", conninfo]) # Fails as the function `public.digest` is not installed, init-admin calls is needed # first (the next tests show such behavior) assert result.exit_code == 1, f"Unexpected output: {result.output}" def test_cli_swh_db_initialization_works_with_flags( - cli_runner, test_db, mock_package_sql + cli_runner, postgresql, mock_import_swhmodule ): """Init commands with carefully crafted libpq conninfo works """ - module_name = "anything" # it's mocked here - conninfo = craft_conninfo(test_db) + module_name = "test.cli" # it's mocked here + conninfo = craft_conninfo(postgresql) result = cli_runner.invoke(swhdb, ["init-admin", module_name, "--dbname", conninfo]) assert result.exit_code == 0, f"Unexpected output: {result.output}" result = cli_runner.invoke(swhdb, ["init", module_name, "--dbname", conninfo]) assert result.exit_code == 0, f"Unexpected output: {result.output}" # the origin values in the scripts uses a hash function (which implementation wise # uses a function from the pgcrypt extension, init-admin calls installs it) - with BaseDb.connect(test_db.dsn).cursor() as cur: + with BaseDb.connect(postgresql.dsn).cursor() as cur: cur.execute("select * from origin") origins = cur.fetchall() assert len(origins) == 1 -def test_cli_swh_db_initialization_with_env(swh_db_cli, mock_package_sql, test_db): +def test_cli_swh_db_initialization_with_env( + swh_db_cli, mock_import_swhmodule, postgresql +): """Init commands with standard environment variables works """ - module_name = "anything" # it's mocked here + module_name = 
"test.cli" # it's mocked here cli_runner, db_params = swh_db_cli result = cli_runner.invoke( swhdb, ["init-admin", module_name, "--dbname", db_params["dbname"]] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" result = cli_runner.invoke( swhdb, ["init", module_name, "--dbname", db_params["dbname"]] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" # the origin values in the scripts uses a hash function (which implementation wise # uses a function from the pgcrypt extension, init-admin calls installs it) - with BaseDb.connect(test_db.dsn).cursor() as cur: + with BaseDb.connect(postgresql.dsn).cursor() as cur: cur.execute("select * from origin") origins = cur.fetchall() assert len(origins) == 1 -def test_cli_swh_db_initialization_idempotent(swh_db_cli, mock_package_sql, test_db): +def test_cli_swh_db_initialization_idempotent( + swh_db_cli, mock_import_swhmodule, postgresql +): """Multiple runs of the init commands are idempotent """ - module_name = "anything" # mocked + module_name = "test.cli" # mocked cli_runner, db_params = swh_db_cli result = cli_runner.invoke( swhdb, ["init-admin", module_name, "--dbname", db_params["dbname"]] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" result = cli_runner.invoke( swhdb, ["init", module_name, "--dbname", db_params["dbname"]] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" result = cli_runner.invoke( swhdb, ["init-admin", module_name, "--dbname", db_params["dbname"]] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" result = cli_runner.invoke( swhdb, ["init", module_name, "--dbname", db_params["dbname"]] ) assert result.exit_code == 0, f"Unexpected output: {result.output}" # the origin values in the scripts uses a hash function (which implementation wise # uses a function from the pgcrypt extension, init-admin calls installs it) - with BaseDb.connect(test_db.dsn).cursor() as cur: + with 
BaseDb.connect(postgresql.dsn).cursor() as cur: cur.execute("select * from origin") origins = cur.fetchall() assert len(origins) == 1 + + +def test_cli_swh_db_create_and_init_db_new_api( + cli_runner, postgresql, mock_import_swhmodule, mocker, tmp_path +): + """Create a db then initializing it should be ok for a "new style" datastore + + """ + module_name = "test.cli_new" + + conninfo = craft_conninfo(postgresql) + + # This initializes the schema and data + cfgfile = tmp_path / "config.yml" + cfgfile.write_text(yaml.dump({module_name: {"cls": "postgresql", "db": conninfo}})) + result = cli_runner.invoke(swhdb, ["init-admin", module_name, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + result = cli_runner.invoke(swhdb, ["-C", cfgfile, "init", module_name]) + + assert ( + result.exit_code == 0 + ), f"Unexpected output: {traceback.print_tb(result.exc_info[2])}" + + # the origin value in the scripts uses a hash function (which implementation wise + # uses a function from the pgcrypt extension, installed during db creation step) + with BaseDb.connect(conninfo).cursor() as cur: + cur.execute("select * from origin") + origins = cur.fetchall() + assert len(origins) == 1 + + +def test_cli_swh_db_upgrade_new_api(cli_runner, postgresql, datadir, mocker, tmp_path): + """Upgrade scenario for a "new style" datastore + + """ + module_name = "test.cli_new" + + # the `current_version` variable is the version that will be returned by + # any call to `get_current_version()` in this test session, thanks to the + # local mocked version of import_swhmodule() below. + current_version = 1 + + # custom version of the mockup to make it easy to change the + # current_version returned by get_current_version() + # TODO: find a better solution for this... 
+ def import_swhmodule_mock(modname): + if modname.startswith("test."): + dirname = modname.split(".", 1)[1] + + def get_datastore(cls, **kw): + return mocker.MagicMock(get_current_version=lambda: current_version) + + return mocker.MagicMock( + __name__=modname, + __file__=os.path.join(datadir, dirname, "__init__.py"), + name=modname, + get_datastore=get_datastore, + ) + + return import_swhmodule(modname) + + mocker.patch("swh.core.db.db_utils.import_swhmodule", import_swhmodule_mock) + conninfo = craft_conninfo(postgresql) + + # This initializes the schema and data + cfgfile = tmp_path / "config.yml" + cfgfile.write_text(yaml.dump({module_name: {"cls": "postgresql", "db": conninfo}})) + result = cli_runner.invoke(swhdb, ["init-admin", module_name, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + result = cli_runner.invoke(swhdb, ["-C", cfgfile, "init", module_name]) + + assert ( + result.exit_code == 0 + ), f"Unexpected output: {traceback.print_tb(result.exc_info[2])}" + + assert swh_db_version(conninfo) == 1 + + # the upgrade should not do anything because the datastore does advertise + # version 1 + result = cli_runner.invoke(swhdb, ["-C", cfgfile, "upgrade", module_name]) + assert swh_db_version(conninfo) == 1 + + # advertise current version as 3, a simple upgrade should get us there, but + # no further + current_version = 3 + result = cli_runner.invoke(swhdb, ["-C", cfgfile, "upgrade", module_name]) + assert swh_db_version(conninfo) == 3 + + # an attempt to go further should not do anything + result = cli_runner.invoke( + swhdb, ["-C", cfgfile, "upgrade", module_name, "--to-version", 5] + ) + assert swh_db_version(conninfo) == 3 + # an attempt to go lower should not do anything + result = cli_runner.invoke( + swhdb, ["-C", cfgfile, "upgrade", module_name, "--to-version", 2] + ) + assert swh_db_version(conninfo) == 3 + + # advertise current version as 6, an upgrade with --to-version 4 should + # stick to the given 
version 4 and no further + current_version = 6 + result = cli_runner.invoke( + swhdb, ["-C", cfgfile, "upgrade", module_name, "--to-version", 4] + ) + assert swh_db_version(conninfo) == 4 + assert "migration was not complete" in result.output + + # attempt to upgrade to a newer version than current code version fails + result = cli_runner.invoke( + swhdb, + ["-C", cfgfile, "upgrade", module_name, "--to-version", current_version + 1], + ) + assert result.exit_code != 0 + assert swh_db_version(conninfo) == 4 + + cnx = BaseDb.connect(conninfo) + with cnx.transaction() as cur: + cur.execute("drop table dbmodule") + assert swh_db_module(conninfo) is None + + # db migration should recreate the missing dbmodule table + result = cli_runner.invoke(swhdb, ["-C", cfgfile, "upgrade", module_name]) + assert result.exit_code == 0 + assert "Warning: the database does not have a dbmodule table." in result.output + assert ( + "Write the module information (test.cli_new) in the database? [Y/n]" + in result.output + ) + assert swh_db_module(conninfo) == module_name diff --git a/swh/core/db/tests/test_db_utils.py b/swh/core/db/tests/test_db_utils.py new file mode 100644 index 0000000..7e2719d --- /dev/null +++ b/swh/core/db/tests/test_db_utils.py @@ -0,0 +1,189 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from datetime import datetime, timedelta +from os import path + +import pytest + +from swh.core.cli.db import db as swhdb +from swh.core.db import BaseDb +from swh.core.db.db_utils import ( + get_database_info, + get_sql_for_package, + now, + swh_db_module, + swh_db_upgrade, + swh_db_version, + swh_db_versions, + swh_set_db_module, +) + +from .test_cli import craft_conninfo + + +@pytest.mark.parametrize("module", ["test.cli", "test.cli_new"]) +def 
test_get_sql_for_package(mock_import_swhmodule, module): + files = get_sql_for_package(module) + assert files + assert [f.name for f in files] == [ + "0-superuser-init.sql", + "30-schema.sql", + "40-funcs.sql", + "50-data.sql", + ] + + +@pytest.mark.parametrize("module", ["test.cli", "test.cli_new"]) +def test_db_utils_versions(cli_runner, postgresql, mock_import_swhmodule, module): + """Check get_database_info, swh_db_versions and swh_db_module work ok + + This test checks db versions for both a db with "new style" set of sql init + scripts (i.e. the dbversion table is not created in these scripts, but by + the populate_database_for_package() function directly, via the 'swh db + init' command) and an "old style" set (dbversion created in the scripts). + + """ + conninfo = craft_conninfo(postgresql) + result = cli_runner.invoke(swhdb, ["init-admin", module, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + result = cli_runner.invoke( + swhdb, ["init", module, "--dbname", conninfo, "--initial-version", 10] + ) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + + # check the swh_db_module() function + assert swh_db_module(conninfo) == module + + # the dbversion and dbmodule tables exist and are populated + dbmodule, dbversion, dbflavor = get_database_info(conninfo) + # check also the swh_db_versions() function + versions = swh_db_versions(conninfo) + + assert dbmodule == module + assert dbversion == 10 + assert dbflavor is None + # check also the swh_db_versions() function + versions = swh_db_versions(conninfo) + assert len(versions) == 1 + assert versions[0][0] == 10 + if module == "test.cli": + assert versions[0][1] == datetime.fromisoformat( + "2016-02-22T15:56:28.358587+00:00" + ) + assert versions[0][2] == "Work In Progress" + else: + # new scheme but with no datastore (so no version support from there) + assert versions[0][2] == "DB initialization" + + # add a few versions in dbversion + cnx = 
BaseDb.connect(conninfo) + with cnx.transaction() as cur: + cur.executemany( + "insert into dbversion(version, release, description) values (%s, %s, %s)", + [(i, now(), f"Upgrade to version {i}") for i in range(11, 15)], + ) + + dbmodule, dbversion, dbflavor = get_database_info(conninfo) + assert dbmodule == module + assert dbversion == 14 + assert dbflavor is None + + versions = swh_db_versions(conninfo) + assert len(versions) == 5 + for i, (version, ts, desc) in enumerate(versions): + assert version == (14 - i) # these are in reverse order + if version > 10: + assert desc == f"Upgrade to version {version}" + assert (now() - ts) < timedelta(seconds=1) + + +@pytest.mark.parametrize("module", ["test.cli_new"]) +def test_db_utils_upgrade( + cli_runner, postgresql, mock_import_swhmodule, module, datadir +): + """Check swh_db_upgrade + + """ + conninfo = craft_conninfo(postgresql) + result = cli_runner.invoke(swhdb, ["init-admin", module, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + result = cli_runner.invoke(swhdb, ["init", module, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + + assert swh_db_version(conninfo) == 1 + new_version = swh_db_upgrade(conninfo, module) + assert new_version == 6 + assert swh_db_version(conninfo) == 6 + + versions = swh_db_versions(conninfo) + # get rid of dates to ease checking + versions = [(v[0], v[2]) for v in versions] + assert versions[-1] == (1, "DB initialization") + sqlbasedir = path.join(datadir, module.split(".", 1)[1], "sql", "upgrades") + + assert versions[1:-1] == [ + (i, f"Upgraded to version {i} using {sqlbasedir}/{i:03d}.sql") + for i in range(5, 1, -1) + ] + assert versions[0] == (6, "Updated version from upgrade script") + + cnx = BaseDb.connect(conninfo) + with cnx.transaction() as cur: + cur.execute("select url from origin where url like 'version%'") + result = cur.fetchall() + assert result == [("version%03d" % i,) for i in 
range(2, 7)] + cur.execute( + "select url from origin where url = 'this should never be executed'" + ) + result = cur.fetchall() + assert not result + + +@pytest.mark.parametrize("module", ["test.cli_new"]) +def test_db_utils_swh_db_upgrade_sanity_checks( + cli_runner, postgresql, mock_import_swhmodule, module, datadir +): + """Check swh_db_upgrade + + """ + conninfo = craft_conninfo(postgresql) + result = cli_runner.invoke(swhdb, ["init-admin", module, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + result = cli_runner.invoke(swhdb, ["init", module, "--dbname", conninfo]) + assert result.exit_code == 0, f"Unexpected output: {result.output}" + + cnx = BaseDb.connect(conninfo) + with cnx.transaction() as cur: + cur.execute("drop table dbmodule") + + # try to upgrade with a unset module + with pytest.raises(ValueError): + swh_db_upgrade(conninfo, module) + + # check the dbmodule is unset + assert swh_db_module(conninfo) is None + + # set the stored module to something else + swh_set_db_module(conninfo, f"{module}2") + assert swh_db_module(conninfo) == f"{module}2" + + # try to upgrade with a different module + with pytest.raises(ValueError): + swh_db_upgrade(conninfo, module) + + # revert to the proper module in the db + swh_set_db_module(conninfo, module, force=True) + assert swh_db_module(conninfo) == module + # trying again is a noop + swh_set_db_module(conninfo, module) + assert swh_db_module(conninfo) == module + + # drop the dbversion table + with cnx.transaction() as cur: + cur.execute("drop table dbversion") + # an upgrade should fail due to missing stored version + with pytest.raises(ValueError): + swh_db_upgrade(conninfo, module) diff --git a/swh/core/sql/log-schema.sql b/swh/core/sql/log-schema.sql deleted file mode 100644 index d8dd5ec..0000000 --- a/swh/core/sql/log-schema.sql +++ /dev/null @@ -1,33 +0,0 @@ ---- ---- logging data model ---- - -create table dbversion -( - version int primary key, - release 
timestamptz, - description text -); - -insert into dbversion(version, release, description) - values(1, now(), 'Work In Progress'); - - -create type log_level as enum ('debug', 'info', 'warning', 'error', 'critical'); - -create table log -( - id bigserial primary key, - ts timestamptz not null default now(), - level log_level not null default 'info', -- importance - message text not null, -- human readable message - data jsonb, -- extra data; when NOT NULL, must contain a key "type" - -- denoting the kind of message within src_module - src_module text, -- fully-qualified source module, e.g., "swh.loader.git" - src_host text, -- FQDN source hostname, e.g., "worker03.softwareheritage.org" - src_pid int -- originating PID, relative to src_host -); - -create index on log (ts); -create index on log (src_module); -create index on log (src_host); diff --git a/swh/core/tests/test_utils.py b/swh/core/tests/test_utils.py index f84c34a..1933d38 100644 --- a/swh/core/tests/test_utils.py +++ b/swh/core/tests/test_utils.py @@ -1,134 +1,138 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest +import pytest from swh.core import utils -class UtilsLib(unittest.TestCase): - def test_grouper(self): - # given - actual_data = utils.grouper((i for i in range(0, 9)), 2) +def test_grouper(): + # given + actual_data = utils.grouper((i for i in range(0, 9)), 2) - out = [] - for d in actual_data: - out.append(list(d)) # force generator resolution for checks + out = [] + for d in actual_data: + out.append(list(d)) # force generator resolution for checks - self.assertEqual(out, [[0, 1], [2, 3], [4, 5], [6, 7], [8]]) + assert out == [[0, 1], [2, 3], [4, 5], [6, 7], [8]] - # given - actual_data = utils.grouper((i for i in range(9, 0, -1)), 4) + # given + actual_data = 
utils.grouper((i for i in range(9, 0, -1)), 4) - out = [] - for d in actual_data: - out.append(list(d)) # force generator resolution for checks + out = [] + for d in actual_data: + out.append(list(d)) # force generator resolution for checks - self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]]) + assert out == [[9, 8, 7, 6], [5, 4, 3, 2], [1]] - def test_grouper_with_stop_value(self): - # given - actual_data = utils.grouper(((i, i + 1) for i in range(0, 9)), 2) - out = [] - for d in actual_data: - out.append(list(d)) # force generator resolution for checks +def test_grouper_with_stop_value(): + # given + actual_data = utils.grouper(((i, i + 1) for i in range(0, 9)), 2) - self.assertEqual( - out, - [ - [(0, 1), (1, 2)], - [(2, 3), (3, 4)], - [(4, 5), (5, 6)], - [(6, 7), (7, 8)], - [(8, 9)], - ], - ) + out = [] + for d in actual_data: + out.append(list(d)) # force generator resolution for checks - # given - actual_data = utils.grouper((i for i in range(9, 0, -1)), 4) + assert out == [ + [(0, 1), (1, 2)], + [(2, 3), (3, 4)], + [(4, 5), (5, 6)], + [(6, 7), (7, 8)], + [(8, 9)], + ] - out = [] - for d in actual_data: - out.append(list(d)) # force generator resolution for checks + # given + actual_data = utils.grouper((i for i in range(9, 0, -1)), 4) - self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]]) + out = [] + for d in actual_data: + out.append(list(d)) # force generator resolution for checks - def test_backslashescape_errors(self): - raw_data_err = b"abcd\x80" - with self.assertRaises(UnicodeDecodeError): - raw_data_err.decode("utf-8", "strict") + assert out == [[9, 8, 7, 6], [5, 4, 3, 2], [1]] - self.assertEqual( - raw_data_err.decode("utf-8", "backslashescape"), "abcd\\x80", - ) - raw_data_ok = b"abcd\xc3\xa9" - self.assertEqual( - raw_data_ok.decode("utf-8", "backslashescape"), - raw_data_ok.decode("utf-8", "strict"), - ) +def test_backslashescape_errors(): + raw_data_err = b"abcd\x80" + with pytest.raises(UnicodeDecodeError): + 
raw_data_err.decode("utf-8", "strict") - unicode_data = "abcdef\u00a3" - self.assertEqual( - unicode_data.encode("ascii", "backslashescape"), b"abcdef\\xa3", - ) + assert raw_data_err.decode("utf-8", "backslashescape") == "abcd\\x80" - def test_encode_with_unescape(self): - valid_data = "\\x01020304\\x00" - valid_data_encoded = b"\x01020304\x00" + raw_data_ok = b"abcd\xc3\xa9" + assert raw_data_ok.decode("utf-8", "backslashescape") == raw_data_ok.decode( + "utf-8", "strict" + ) - self.assertEqual(valid_data_encoded, utils.encode_with_unescape(valid_data)) + unicode_data = "abcdef\u00a3" + assert unicode_data.encode("ascii", "backslashescape") == b"abcdef\\xa3" - def test_encode_with_unescape_invalid_escape(self): - invalid_data = "test\\abcd" - with self.assertRaises(ValueError) as exc: - utils.encode_with_unescape(invalid_data) +def test_encode_with_unescape(): + valid_data = "\\x01020304\\x00" + valid_data_encoded = b"\x01020304\x00" - self.assertIn("invalid escape", exc.exception.args[0]) - self.assertIn("position 4", exc.exception.args[0]) + assert valid_data_encoded == utils.encode_with_unescape(valid_data) - def test_decode_with_escape(self): - backslashes = b"foo\\bar\\\\baz" - backslashes_escaped = "foo\\\\bar\\\\\\\\baz" - self.assertEqual( - backslashes_escaped, utils.decode_with_escape(backslashes), - ) +def test_encode_with_unescape_invalid_escape(): + invalid_data = "test\\abcd" - valid_utf8 = b"foo\xc3\xa2" - valid_utf8_escaped = "foo\u00e2" + with pytest.raises(ValueError) as exc: + utils.encode_with_unescape(invalid_data) - self.assertEqual( - valid_utf8_escaped, utils.decode_with_escape(valid_utf8), - ) + assert "invalid escape" in exc.value.args[0] + assert "position 4" in exc.value.args[0] - invalid_utf8 = b"foo\xa2" - invalid_utf8_escaped = "foo\\xa2" - - self.assertEqual( - invalid_utf8_escaped, utils.decode_with_escape(invalid_utf8), - ) - valid_utf8_nul = b"foo\xc3\xa2\x00" - valid_utf8_nul_escaped = "foo\u00e2\\x00" +def 
test_decode_with_escape(): + backslashes = b"foo\\bar\\\\baz" + backslashes_escaped = "foo\\\\bar\\\\\\\\baz" - self.assertEqual( - valid_utf8_nul_escaped, utils.decode_with_escape(valid_utf8_nul), - ) - - def test_commonname(self): - # when - actual_commonname = utils.commonname("/some/where/to/", "/some/where/to/go/to") - # then - self.assertEqual("go/to", actual_commonname) + assert backslashes_escaped == utils.decode_with_escape(backslashes) - # when - actual_commonname2 = utils.commonname( - b"/some/where/to/", b"/some/where/to/go/to" - ) - # then - self.assertEqual(b"go/to", actual_commonname2) + valid_utf8 = b"foo\xc3\xa2" + valid_utf8_escaped = "foo\u00e2" + + assert valid_utf8_escaped == utils.decode_with_escape(valid_utf8) + + invalid_utf8 = b"foo\xa2" + invalid_utf8_escaped = "foo\\xa2" + + assert invalid_utf8_escaped == utils.decode_with_escape(invalid_utf8) + + valid_utf8_nul = b"foo\xc3\xa2\x00" + valid_utf8_nul_escaped = "foo\u00e2\\x00" + + assert valid_utf8_nul_escaped == utils.decode_with_escape(valid_utf8_nul) + + +def test_commonname(): + # when + actual_commonname = utils.commonname("/some/where/to/", "/some/where/to/go/to") + # then + assert "go/to" == actual_commonname + + # when + actual_commonname2 = utils.commonname(b"/some/where/to/", b"/some/where/to/go/to") + # then + assert b"go/to" == actual_commonname2 + + +def test_numfile_sotkey(): + assert utils.numfile_sortkey("00-xxx.sql") == (0, "-xxx.sql") + assert utils.numfile_sortkey("01-xxx.sql") == (1, "-xxx.sql") + assert utils.numfile_sortkey("10-xxx.sql") == (10, "-xxx.sql") + assert utils.numfile_sortkey("99-xxx.sql") == (99, "-xxx.sql") + assert utils.numfile_sortkey("100-xxx.sql") == (100, "-xxx.sql") + assert utils.numfile_sortkey("00100-xxx.sql") == (100, "-xxx.sql") + assert utils.numfile_sortkey("1.sql") == (1, ".sql") + assert utils.numfile_sortkey("1") == (1, "") + assert utils.numfile_sortkey("toto-01.sql") == (999999, "toto-01.sql") + + +def test_basename_sotkey(): + assert 
utils.basename_sortkey("00-xxx.sql") == (0, "-xxx.sql") + assert utils.basename_sortkey("path/to/00-xxx.sql") == (0, "-xxx.sql") diff --git a/swh/core/utils.py b/swh/core/utils.py index a14daa5..79f41cd 100644 --- a/swh/core/utils.py +++ b/swh/core/utils.py @@ -1,122 +1,137 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import codecs from contextlib import contextmanager import itertools import os import re +from typing import Tuple @contextmanager def cwd(path): """Contextually change the working directory to do thy bidding. Then gets back to the original location. """ prev_cwd = os.getcwd() os.chdir(path) try: yield finally: os.chdir(prev_cwd) def grouper(iterable, n): """Collect data into fixed-length size iterables. The last block might contain less elements as it will hold only the remaining number of elements. The invariant here is that the number of elements in the input iterable and the sum of the number of elements of all iterables generated from this function should be equal. Args: iterable (Iterable): an iterable n (int): size of block to slice the iterable into Yields: fixed-length blocks as iterables. As mentioned, the last iterable might be less populated. 
""" args = [iter(iterable)] * n stop_value = object() for _data in itertools.zip_longest(*args, fillvalue=stop_value): yield (d for d in _data if d is not stop_value) def backslashescape_errors(exception): if isinstance(exception, UnicodeDecodeError): bad_data = exception.object[exception.start : exception.end] escaped = "".join(r"\x%02x" % x for x in bad_data) return escaped, exception.end return codecs.backslashreplace_errors(exception) codecs.register_error("backslashescape", backslashescape_errors) def encode_with_unescape(value): """Encode an unicode string containing \\x backslash escapes""" slices = [] start = 0 odd_backslashes = False i = 0 while i < len(value): if value[i] == "\\": odd_backslashes = not odd_backslashes else: if odd_backslashes: if value[i] != "x": raise ValueError( "invalid escape for %r at position %d" % (value, i - 1) ) slices.append( value[start : i - 1].replace("\\\\", "\\").encode("utf-8") ) slices.append(bytes.fromhex(value[i + 1 : i + 3])) odd_backslashes = False start = i = i + 3 continue i += 1 slices.append(value[start:i].replace("\\\\", "\\").encode("utf-8")) return b"".join(slices) def decode_with_escape(value): """Decode a bytestring as utf-8, escaping the bytes of invalid utf-8 sequences as \\x. We also escape NUL bytes as they are invalid in JSON strings. """ # escape backslashes value = value.replace(b"\\", b"\\\\") value = value.replace(b"\x00", b"\\x00") return value.decode("utf-8", "backslashescape") def commonname(path0, path1, as_str=False): """Compute the commonname between the path0 and path1. """ return path1.split(path0)[1] -def numfile_sortkey(fname): +def numfile_sortkey(fname: str) -> Tuple[int, str]: """Simple function to sort filenames of the form: nnxxx.ext where nn is a number according to the numbers. + Returns a tuple (order, remaining), where 'order' is the numeric (int) + value extracted from the file name, and 'remaining' is the remaining part + of the file name. 
+ Typically used to sort sql/nn-swh-xxx.sql files. + + Unmatched file names will return 999999 as order value. + """ - num, rem = re.match(r"(\d*)(.*)", fname).groups() - return (num and int(num) or 99, rem) + m = re.match(r"(\d*)(.*)", fname) + assert m is not None + num, rem = m.groups() + return (int(num) if num else 999999, rem) + + +def basename_sortkey(fname: str) -> Tuple[int, str]: + "like numfile_sortkey but on basenames" + return numfile_sortkey(os.path.basename(fname))