diff --git a/PKG-INFO b/PKG-INFO index 1b7d378..abd445b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,93 +1,93 @@ Metadata-Version: 2.1 Name: swh.core -Version: 0.4.0 +Version: 0.5.0 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-core/ Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism - database connection - http-based RPC client/server Development ----------- We strongly recommend you to use a [virtualenv][1] if you want to run tests or hack the code. To set up your development environment: ``` (swh) user@host:~/swh-environment/swh-core$ pip install -e .[testing] ``` This will install every Python package needed to run this package's tests. Unit tests can be executed using [pytest][2] or [tox][3]. ``` (swh) user@host:~/swh-environment/swh-core$ pytest ============================== test session starts ============================== platform linux -- Python 3.7.3, pytest-3.10.1, py-1.8.0, pluggy-0.12.0 hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/ddouard/src/swh-environment/swh-core/.hypothesis/examples') rootdir: /home/ddouard/src/swh-environment/swh-core, inifile: pytest.ini plugins: requests-mock-1.6.0, hypothesis-4.26.4, celery-4.3.0, postgresql-1.4.1 collected 89 items swh/core/api/tests/test_api.py .. [ 2%] swh/core/api/tests/test_async.py .... [ 6%] swh/core/api/tests/test_serializers.py ..... [ 12%] swh/core/db/tests/test_db.py .... [ 16%] swh/core/tests/test_cli.py ...... [ 23%] swh/core/tests/test_config.py .............. [ 39%] swh/core/tests/test_statsd.py ........................................... [ 87%] .... [ 92%] swh/core/tests/test_utils.py ....... [100%] ===================== 89 passed, 9 warnings in 6.94 seconds ===================== ``` Note: this git repository uses [pre-commit][4] hooks to ensure better and more consistent code. It should already be installed in your virtualenv (if not, just type `pip install pre-commit`). Make sure to activate it in your local copy of the git repository: ``` (swh) user@host:~/swh-environment/swh-core$ pre-commit install pre-commit installed at .git/hooks/pre-commit ``` Please read the [developer setup manual][5] for more information on how to hack on Software Heritage. [1]: https://virtualenv.pypa.io [2]: https://docs.pytest.org [3]: https://tox.readthedocs.io [4]: https://pre-commit.com [5]: https://docs.softwareheritage.org/devel/developer-setup.html Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing-core Provides-Extra: logging Provides-Extra: db Provides-Extra: testing-db Provides-Extra: http Provides-Extra: testing diff --git a/mypy.ini b/mypy.ini index 5d30edb..696abf0 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,45 +1,48 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-aiohttp_utils.*] ignore_missing_imports = True [mypy-arrow.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-decorator.*] ignore_missing_imports = True [mypy-deprecated.*] ignore_missing_imports = True [mypy-django.*] # false positive, only used my hypotesis' extras ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-msgpack.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-psycopg2.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True +[mypy-pytest_postgresql.*] +ignore_missing_imports = True + [mypy-requests_mock.*] ignore_missing_imports = True [mypy-systemd.*] ignore_missing_imports = True diff --git a/requirements-db.txt b/requirements-db.txt index 921e04d..d0f0975 100644 --- a/requirements-db.txt +++ b/requirements-db.txt @@ -1,3 +1,4 @@ # requirements for swh.core.db psycopg2 typing-extensions +pytest-postgresql diff --git a/requirements-test-db.txt b/requirements-test-db.txt index cfd42eb..8b13789 100644 --- a/requirements-test-db.txt +++ b/requirements-test-db.txt @@ -1 +1 @@ -pytest-postgresql + diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index 1b7d378..abd445b 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,93 +1,93 @@ Metadata-Version: 2.1 Name: swh.core -Version: 0.4.0 +Version: 0.5.0 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-core/ Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism - database connection - http-based RPC client/server Development ----------- We strongly recommend you to use a [virtualenv][1] if you want to run tests or hack the code. To set up your development environment: ``` (swh) user@host:~/swh-environment/swh-core$ pip install -e .[testing] ``` This will install every Python package needed to run this package's tests. Unit tests can be executed using [pytest][2] or [tox][3]. ``` (swh) user@host:~/swh-environment/swh-core$ pytest ============================== test session starts ============================== platform linux -- Python 3.7.3, pytest-3.10.1, py-1.8.0, pluggy-0.12.0 hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/ddouard/src/swh-environment/swh-core/.hypothesis/examples') rootdir: /home/ddouard/src/swh-environment/swh-core, inifile: pytest.ini plugins: requests-mock-1.6.0, hypothesis-4.26.4, celery-4.3.0, postgresql-1.4.1 collected 89 items swh/core/api/tests/test_api.py .. [ 2%] swh/core/api/tests/test_async.py .... [ 6%] swh/core/api/tests/test_serializers.py ..... [ 12%] swh/core/db/tests/test_db.py .... [ 16%] swh/core/tests/test_cli.py ...... [ 23%] swh/core/tests/test_config.py .............. [ 39%] swh/core/tests/test_statsd.py ........................................... [ 87%] .... [ 92%] swh/core/tests/test_utils.py ....... [100%] ===================== 89 passed, 9 warnings in 6.94 seconds ===================== ``` Note: this git repository uses [pre-commit][4] hooks to ensure better and more consistent code. It should already be installed in your virtualenv (if not, just type `pip install pre-commit`). Make sure to activate it in your local copy of the git repository: ``` (swh) user@host:~/swh-environment/swh-core$ pre-commit install pre-commit installed at .git/hooks/pre-commit ``` Please read the [developer setup manual][5] for more information on how to hack on Software Heritage. [1]: https://virtualenv.pypa.io [2]: https://docs.pytest.org [3]: https://tox.readthedocs.io [4]: https://pre-commit.com [5]: https://docs.softwareheritage.org/devel/developer-setup.html Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing-core Provides-Extra: logging Provides-Extra: db Provides-Extra: testing-db Provides-Extra: http Provides-Extra: testing diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index 4a3e3a0..200cc64 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,102 +1,106 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-db.txt requirements-http.txt requirements-logging.txt requirements-swh.txt requirements-test-db.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/entry_points.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py swh/core/api_async.py swh/core/collections.py swh/core/config.py swh/core/logger.py swh/core/py.typed swh/core/pytest_plugin.py swh/core/sentry.py swh/core/statsd.py swh/core/tarball.py swh/core/utils.py swh/core/api/__init__.py swh/core/api/asynchronous.py swh/core/api/classes.py swh/core/api/gunicorn_config.py swh/core/api/negotiation.py swh/core/api/serializers.py swh/core/api/tests/__init__.py swh/core/api/tests/server_testing.py swh/core/api/tests/test_async.py swh/core/api/tests/test_classes.py swh/core/api/tests/test_gunicorn.py swh/core/api/tests/test_rpc_client.py swh/core/api/tests/test_rpc_client_server.py swh/core/api/tests/test_rpc_server.py swh/core/api/tests/test_serializers.py swh/core/cli/__init__.py swh/core/cli/db.py swh/core/db/__init__.py swh/core/db/common.py swh/core/db/db_utils.py +swh/core/db/pytest_plugin.py swh/core/db/tests/__init__.py swh/core/db/tests/conftest.py swh/core/db/tests/db_testing.py swh/core/db/tests/test_cli.py swh/core/db/tests/test_db.py +swh/core/db/tests/test_db_utils.py +swh/core/db/tests/data/0-schema.sql +swh/core/db/tests/data/1-data.sql swh/core/sql/log-schema.sql swh/core/tests/__init__.py swh/core/tests/test_cli.py swh/core/tests/test_collections.py swh/core/tests/test_config.py swh/core/tests/test_logger.py swh/core/tests/test_pytest_plugin.py swh/core/tests/test_statsd.py swh/core/tests/test_tarball.py swh/core/tests/test_utils.py swh/core/tests/data/archives/groff-1.02.tar.Z swh/core/tests/data/archives/hello.tar swh/core/tests/data/archives/hello.tar.bz2 swh/core/tests/data/archives/hello.tar.gz swh/core/tests/data/archives/hello.tar.lz swh/core/tests/data/archives/hello.tar.x swh/core/tests/data/archives/hello.zip swh/core/tests/data/http_example.com/something.json swh/core/tests/data/https_example.com/file.json swh/core/tests/data/https_example.com/file.json,name=doe,firstname=jane swh/core/tests/data/https_example.com/file.json_visit1 swh/core/tests/data/https_example.com/other.json swh/core/tests/data/https_forge.s.o/api_diffusion,attachments[uris]=1 swh/core/tests/data/https_www.reference.com/web,q=What+Is+an+Example+of+a+URL?,qo=contentPageRelatedSearch,o=600605,l=dir,sga=1 swh/core/tests/fixture/__init__.py swh/core/tests/fixture/conftest.py swh/core/tests/fixture/test_pytest_plugin.py swh/core/tests/fixture/data/https_example.com/file.json \ No newline at end of file diff --git a/swh.core.egg-info/requires.txt b/swh.core.egg-info/requires.txt index 2ad0f15..d0b3e47 100644 --- a/swh.core.egg-info/requires.txt +++ b/swh.core.egg-info/requires.txt @@ -1,54 +1,54 @@ Click Deprecated PyYAML sentry-sdk [db] psycopg2 typing-extensions +pytest-postgresql [http] aiohttp aiohttp_utils>=3.1.1 arrow decorator Flask iso8601 msgpack>0.5 requests blinker [logging] systemd-python [testing] pytest pytest-mock requests-mock hypothesis>=3.11.0 pre-commit pytz -pytest-postgresql psycopg2 typing-extensions +pytest-postgresql aiohttp aiohttp_utils>=3.1.1 arrow decorator Flask iso8601 msgpack>0.5 requests blinker systemd-python [testing-core] pytest pytest-mock requests-mock hypothesis>=3.11.0 pre-commit pytz [testing-db] -pytest-postgresql diff --git a/swh/core/api/tests/test_async.py b/swh/core/api/tests/test_async.py index 4b25aa3..8923222 100644 --- a/swh/core/api/tests/test_async.py +++ b/swh/core/api/tests/test_async.py @@ -1,243 +1,233 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import msgpack import pytest from swh.core.api.asynchronous import ( Response, RPCServerApp, decode_request, encode_msgpack, ) from swh.core.api.serializers import SWHJSONEncoder, msgpack_dumps pytest_plugins = ["aiohttp.pytest_plugin", "pytester"] class TestServerException(Exception): pass class TestClientError(Exception): pass async def root(request): return Response("toor") STRUCT = { "txt": "something stupid", # 'date': datetime.date(2019, 6, 9), # not supported "datetime": datetime.datetime(2019, 6, 9, 10, 12, tzinfo=datetime.timezone.utc), "timedelta": datetime.timedelta(days=-2, hours=3), "int": 42, "float": 3.14, "subdata": { "int": 42, "datetime": datetime.datetime( 2019, 6, 10, 11, 12, tzinfo=datetime.timezone.utc ), }, "list": [ 42, datetime.datetime(2019, 9, 10, 11, 12, tzinfo=datetime.timezone.utc), "ok", ], } async def struct(request): return Response(STRUCT) async def echo(request): data = await decode_request(request) return Response(data) async def server_exception(request): raise TestServerException() async def client_error(request): raise TestClientError() async def echo_no_nego(request): # let the content negotiation handle the serialization for us... data = await decode_request(request) ret = encode_msgpack(data) return ret def check_mimetype(src, dst): src = src.split(";")[0].strip() dst = dst.split(";")[0].strip() assert src == dst @pytest.fixture def async_app(): app = RPCServerApp() app.client_exception_classes = (TestClientError,) app.router.add_route("GET", "/", root) app.router.add_route("GET", "/struct", struct) app.router.add_route("POST", "/echo", echo) app.router.add_route("GET", "/server_exception", server_exception) app.router.add_route("GET", "/client_error", client_error) app.router.add_route("POST", "/echo-no-nego", echo_no_nego) return app -async def test_get_simple(async_app, aiohttp_client) -> None: - assert async_app is not None +@pytest.fixture +def cli(async_app, aiohttp_client, loop): + return loop.run_until_complete(aiohttp_client(async_app)) + - cli = await aiohttp_client(async_app) +async def test_get_simple(cli) -> None: resp = await cli.get("/") assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") data = await resp.read() value = msgpack.unpackb(data, raw=False) assert value == "toor" -async def test_get_server_exception(async_app, aiohttp_client) -> None: - cli = await aiohttp_client(async_app) +async def test_get_server_exception(cli) -> None: resp = await cli.get("/server_exception") assert resp.status == 500 data = await resp.read() data = msgpack.unpackb(data, raw=False) assert data["exception"]["type"] == "TestServerException" -async def test_get_client_error(async_app, aiohttp_client) -> None: - cli = await aiohttp_client(async_app) +async def test_get_client_error(cli) -> None: resp = await cli.get("/client_error") assert resp.status == 400 data = await resp.read() data = msgpack.unpackb(data, raw=False) assert data["exception"]["type"] == "TestClientError" -async def test_get_simple_nego(async_app, aiohttp_client) -> None: - cli = await aiohttp_client(async_app) +async def test_get_simple_nego(cli) -> None: for ctype in ("x-msgpack", "json"): resp = await cli.get("/", headers={"Accept": "application/%s" % ctype}) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/%s" % ctype) assert (await decode_request(resp)) == "toor" -async def test_get_struct(async_app, aiohttp_client) -> None: +async def test_get_struct(cli) -> None: """Test returned structured from a simple GET data is OK""" - cli = await aiohttp_client(async_app) resp = await cli.get("/struct") assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") assert (await decode_request(resp)) == STRUCT -async def test_get_struct_nego(async_app, aiohttp_client) -> None: +async def test_get_struct_nego(cli) -> None: """Test returned structured from a simple GET data is OK""" - cli = await aiohttp_client(async_app) for ctype in ("x-msgpack", "json"): resp = await cli.get("/struct", headers={"Accept": "application/%s" % ctype}) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/%s" % ctype) assert (await decode_request(resp)) == STRUCT -async def test_post_struct_msgpack(async_app, aiohttp_client) -> None: +async def test_post_struct_msgpack(cli) -> None: """Test that msgpack encoded posted struct data is returned as is""" - cli = await aiohttp_client(async_app) # simple struct resp = await cli.post( "/echo", headers={"Content-Type": "application/x-msgpack"}, data=msgpack_dumps({"toto": 42}), ) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") assert (await decode_request(resp)) == {"toto": 42} # complex struct resp = await cli.post( "/echo", headers={"Content-Type": "application/x-msgpack"}, data=msgpack_dumps(STRUCT), ) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") assert (await decode_request(resp)) == STRUCT -async def test_post_struct_json(async_app, aiohttp_client) -> None: +async def test_post_struct_json(cli) -> None: """Test that json encoded posted struct data is returned as is""" - cli = await aiohttp_client(async_app) - resp = await cli.post( "/echo", headers={"Content-Type": "application/json"}, data=json.dumps({"toto": 42}, cls=SWHJSONEncoder), ) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") assert (await decode_request(resp)) == {"toto": 42} resp = await cli.post( "/echo", headers={"Content-Type": "application/json"}, data=json.dumps(STRUCT, cls=SWHJSONEncoder), ) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") # assert resp.headers['Content-Type'] == 'application/x-msgpack' assert (await decode_request(resp)) == STRUCT -async def test_post_struct_nego(async_app, aiohttp_client) -> None: +async def test_post_struct_nego(cli) -> None: """Test that json encoded posted struct data is returned as is using content negotiation (accept json or msgpack). """ - cli = await aiohttp_client(async_app) - for ctype in ("x-msgpack", "json"): resp = await cli.post( "/echo", headers={ "Content-Type": "application/json", "Accept": "application/%s" % ctype, }, data=json.dumps(STRUCT, cls=SWHJSONEncoder), ) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/%s" % ctype) assert (await decode_request(resp)) == STRUCT -async def test_post_struct_no_nego(async_app, aiohttp_client) -> None: +async def test_post_struct_no_nego(cli) -> None: """Test that json encoded posted struct data is returned as msgpack when using non-negotiation-compatible handlers. """ - cli = await aiohttp_client(async_app) - for ctype in ("x-msgpack", "json"): resp = await cli.post( "/echo-no-nego", headers={ "Content-Type": "application/json", "Accept": "application/%s" % ctype, }, data=json.dumps(STRUCT, cls=SWHJSONEncoder), ) assert resp.status == 200 check_mimetype(resp.headers["Content-Type"], "application/x-msgpack") assert (await decode_request(resp)) == STRUCT diff --git a/swh/core/config.py b/swh/core/config.py index 7490ef5..119eadc 100644 --- a/swh/core/config.py +++ b/swh/core/config.py @@ -1,378 +1,308 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from itertools import chain import logging import os from typing import Any, Callable, Dict, List, Optional, Tuple -from deprecated import deprecated import yaml logger = logging.getLogger(__name__) SWH_CONFIG_DIRECTORIES = [ "~/.config/swh", "~/.swh", "/etc/softwareheritage", ] SWH_GLOBAL_CONFIG = "global.yml" SWH_DEFAULT_GLOBAL_CONFIG = { "max_content_size": ("int", 100 * 1024 * 1024), } SWH_CONFIG_EXTENSIONS = [ ".yml", ] # conversion per type _map_convert_fn: Dict[str, Callable] = { "int": int, "bool": lambda x: x.lower() == "true", "list[str]": lambda x: [value.strip() for value in x.split(",")], "list[int]": lambda x: [int(value.strip()) for value in x.split(",")], } _map_check_fn: Dict[str, Callable] = { "int": lambda x: isinstance(x, int), "bool": lambda x: isinstance(x, bool), "list[str]": lambda x: (isinstance(x, list) and all(isinstance(y, str) for y in x)), "list[int]": lambda x: (isinstance(x, list) and all(isinstance(y, int) for y in x)), } def exists_accessible(filepath: str) -> bool: """Check whether a file exists, and is accessible. Returns: True if the file exists and is accessible False if the file does not exist Raises: PermissionError if the file cannot be read. """ try: os.stat(filepath) except PermissionError: raise except FileNotFoundError: return False else: if os.access(filepath, os.R_OK): return True else: raise PermissionError("Permission denied: {filepath!r}") def config_basepath(config_path: str) -> str: """Return the base path of a configuration file""" if config_path.endswith(".yml"): return config_path[:-4] return config_path def read_raw_config(base_config_path: str) -> Dict[str, Any]: """Read the raw config corresponding to base_config_path. Can read yml files. """ yml_file = f"{base_config_path}.yml" if exists_accessible(yml_file): logger.debug("Loading config file %s", yml_file) with open(yml_file) as f: return yaml.safe_load(f) return {} def config_exists(config_path): """Check whether the given config exists""" basepath = config_basepath(config_path) return any( exists_accessible(basepath + extension) for extension in SWH_CONFIG_EXTENSIONS ) def read( conf_file: Optional[str] = None, default_conf: Optional[Dict[str, Tuple[str, Any]]] = None, ) -> Dict[str, Any]: """Read the user's configuration file. Fill in the gap using `default_conf`. `default_conf` is similar to this:: DEFAULT_CONF = { 'a': ('str', '/tmp/swh-loader-git/log'), 'b': ('str', 'dbname=swhloadergit') 'c': ('bool', true) 'e': ('bool', None) 'd': ('int', 10) } If conf_file is None, return the default config. """ conf: Dict[str, Any] = {} if conf_file: base_config_path = config_basepath(os.path.expanduser(conf_file)) conf = read_raw_config(base_config_path) or {} if not default_conf: return conf # remaining missing default configuration key are set # also type conversion is enforced for underneath layer for key, (nature_type, default_value) in default_conf.items(): val = conf.get(key, None) if val is None: # fallback to default value conf[key] = default_value elif not _map_check_fn.get(nature_type, lambda x: True)(val): # value present but not in the proper format, force type conversion conf[key] = _map_convert_fn.get(nature_type, lambda x: x)(val) return conf def priority_read( conf_filenames: List[str], default_conf: Optional[Dict[str, Tuple[str, Any]]] = None ): """Try reading the configuration files from conf_filenames, in order, and return the configuration from the first one that exists. default_conf has the same specification as it does in read. """ # Try all the files in order for filename in conf_filenames: full_filename = os.path.expanduser(filename) if config_exists(full_filename): return read(full_filename, default_conf) # Else, return the default configuration return read(None, default_conf) def merge_default_configs(base_config, *other_configs): """Merge several default config dictionaries, from left to right""" full_config = base_config.copy() for config in other_configs: full_config.update(config) return full_config def merge_configs(base: Optional[Dict[str, Any]], other: Optional[Dict[str, Any]]): """Merge two config dictionaries This does merge config dicts recursively, with the rules, for every value of the dicts (with 'val' not being a dict): - None + type -> type - type + None -> None - dict + dict -> dict (merged) - val + dict -> TypeError - dict + val -> TypeError - val + val -> val (other) for instance: >>> d1 = { ... 'key1': { ... 'skey1': 'value1', ... 'skey2': {'sskey1': 'value2'}, ... }, ... 'key2': 'value3', ... } with >>> d2 = { ... 'key1': { ... 'skey1': 'value4', ... 'skey2': {'sskey2': 'value5'}, ... }, ... 'key3': 'value6', ... } will give: >>> d3 = { ... 'key1': { ... 'skey1': 'value4', # <-- note this ... 'skey2': { ... 'sskey1': 'value2', ... 'sskey2': 'value5', ... }, ... }, ... 'key2': 'value3', ... 'key3': 'value6', ... } >>> assert merge_configs(d1, d2) == d3 Note that no type checking is done for anything but dicts. """ if not isinstance(base, dict) or not isinstance(other, dict): raise TypeError("Cannot merge a %s with a %s" % (type(base), type(other))) output = {} allkeys = set(chain(base.keys(), other.keys())) for k in allkeys: vb = base.get(k) vo = other.get(k) if isinstance(vo, dict): output[k] = merge_configs(vb is not None and vb or {}, vo) elif isinstance(vb, dict) and k in other and other[k] is not None: output[k] = merge_configs(vb, vo is not None and vo or {}) elif k in other: output[k] = deepcopy(vo) else: output[k] = deepcopy(vb) return output def swh_config_paths(base_filename: str) -> List[str]: """Return the Software Heritage specific configuration paths for the given filename.""" return [os.path.join(dirname, base_filename) for dirname in SWH_CONFIG_DIRECTORIES] def prepare_folders(conf, *keys): """Prepare the folder mentioned in config under keys. """ def makedir(folder): if not os.path.exists(folder): os.makedirs(folder) for key in keys: makedir(conf[key]) def load_global_config(): """Load the global Software Heritage config""" return priority_read( swh_config_paths(SWH_GLOBAL_CONFIG), SWH_DEFAULT_GLOBAL_CONFIG, ) def load_named_config(name, default_conf=None, global_conf=True): """Load the config named `name` from the Software Heritage configuration paths. If global_conf is True (default), read the global configuration too. """ conf = {} if global_conf: conf.update(load_global_config()) conf.update(priority_read(swh_config_paths(name), default_conf)) return conf def load_from_envvar(default_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Load configuration yaml file from the environment variable SWH_CONFIG_FILENAME, eventually enriched with default configuration key/value from the default_config dict if provided. Returns: Configuration dict Raises: AssertionError if SWH_CONFIG_FILENAME is undefined """ assert ( "SWH_CONFIG_FILENAME" in os.environ ), "SWH_CONFIG_FILENAME environment variable is undefined." cfg_path = os.environ["SWH_CONFIG_FILENAME"] cfg = read_raw_config(config_basepath(cfg_path)) cfg = merge_configs(default_config or {}, cfg) return cfg - - -@deprecated(version="0.3.2", reason="Use swh.core.config.load_from_envvar instead") -class SWHConfig: - """Mixin to add configuration parsing abilities to classes - - The class should override the class attributes: - - DEFAULT_CONFIG (default configuration to be parsed) - - CONFIG_BASE_FILENAME (the filename of the configuration to be used) - - This class defines one classmethod, parse_config_file, which - parses a configuration file using the default config as set in the - class attribute. - - """ - - DEFAULT_CONFIG = {} # type: Dict[str, Tuple[str, Any]] - CONFIG_BASE_FILENAME = "" # type: Optional[str] - - @classmethod - def parse_config_file( - cls, - base_filename=None, - config_filename=None, - additional_configs=None, - global_config=True, - ): - """Parse the configuration file associated to the current class. - - By default, parse_config_file will load the configuration - cls.CONFIG_BASE_FILENAME from one of the Software Heritage - configuration directories, in order, unless it is overridden - by base_filename or config_filename (which shortcuts the file - lookup completely). - - Args: - - base_filename (str): overrides the default - cls.CONFIG_BASE_FILENAME - - config_filename (str): sets the file to parse instead of - the defaults set from cls.CONFIG_BASE_FILENAME - - additional_configs: (list of default configuration dicts) - allows to override or extend the configuration set in - cls.DEFAULT_CONFIG. - - global_config (bool): Load the global configuration (default: - True) - """ - - if config_filename: - config_filenames = [config_filename] - elif "SWH_CONFIG_FILENAME" in os.environ: - config_filenames = [os.environ["SWH_CONFIG_FILENAME"]] - else: - if not base_filename: - base_filename = cls.CONFIG_BASE_FILENAME - config_filenames = swh_config_paths(base_filename) - if not additional_configs: - additional_configs = [] - - full_default_config = merge_default_configs( - cls.DEFAULT_CONFIG, *additional_configs - ) - - config = {} - if global_config: - config = load_global_config() - - config.update(priority_read(config_filenames, full_default_config)) - - return config diff --git a/swh/core/db/db_utils.py b/swh/core/db/db_utils.py index 77ddb59..95b79ff 100644 --- a/swh/core/db/db_utils.py +++ b/swh/core/db/db_utils.py @@ -1,252 +1,366 @@ -# Copyright (C) 2015-2019 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools +import glob import logging import re -from typing import Optional, Union +import subprocess +from typing import Optional, Set, Union import psycopg2 import psycopg2.extensions +from pytest_postgresql.janitor import DatabaseJanitor, Version + +from swh.core.utils import numfile_sortkey as sortkey logger = logging.getLogger(__name__) def stored_procedure(stored_proc): """decorator to execute remote stored procedure, specified as argument Generally, the body of the decorated function should be empty. If it is not, the stored procedure will be executed first; the function body then. """ def wrap(meth): @functools.wraps(meth) def _meth(self, *args, **kwargs): cur = kwargs.get("cur", None) self._cursor(cur).execute("SELECT %s()" % stored_proc) meth(self, *args, **kwargs) return _meth return wrap def jsonize(value): """Convert a value to a psycopg2 JSON object if necessary""" if isinstance(value, dict): return psycopg2.extras.Json(value) return value def connect_to_conninfo( db_or_conninfo: Union[str, psycopg2.extensions.connection] ) -> psycopg2.extensions.connection: """Connect to the database passed in argument Args: db_or_conninfo: A database connection, or a database connection info string Returns: a connected database handle Raises: psycopg2.Error if the database doesn't exist """ if isinstance(db_or_conninfo, psycopg2.extensions.connection): return db_or_conninfo if "=" not in db_or_conninfo and "//" not in db_or_conninfo: # Database name db_or_conninfo = f"dbname={db_or_conninfo}" db = psycopg2.connect(db_or_conninfo) return db def swh_db_version( db_or_conninfo: Union[str, psycopg2.extensions.connection] ) -> Optional[int]: """Retrieve the swh version of the database. If the database is not initialized, this logs a warning and returns None. Args: db_or_conninfo: A database connection, or a database connection info string Returns: Either the version of the database, or None if it couldn't be detected """ try: db = connect_to_conninfo(db_or_conninfo) except psycopg2.Error: logger.exception("Failed to connect to `%s`", db_or_conninfo) # Database not initialized return None try: with db.cursor() as c: query = "select version from dbversion order by dbversion desc limit 1" try: c.execute(query) return c.fetchone()[0] except psycopg2.errors.UndefinedTable: return None except Exception: logger.exception("Could not get version from `%s`", db_or_conninfo) return None def swh_db_flavor( db_or_conninfo: Union[str, psycopg2.extensions.connection] ) -> Optional[str]: """Retrieve the swh flavor of the database. If the database is not initialized, or the database doesn't support flavors, this returns None. Args: db_or_conninfo: A database connection, or a database connection info string Returns: The flavor of the database, or None if it could not be detected. """ try: db = connect_to_conninfo(db_or_conninfo) except psycopg2.Error: logger.exception("Failed to connect to `%s`", db_or_conninfo) # Database not initialized return None try: with db.cursor() as c: query = "select swh_get_dbflavor()" try: c.execute(query) return c.fetchone()[0] except psycopg2.errors.UndefinedFunction: # function not found: no flavor return None except Exception: logger.exception("Could not get flavor from `%s`", db_or_conninfo) return None # The following code has been imported from psycopg2, version 2.7.4, # https://github.com/psycopg/psycopg2/tree/5afb2ce803debea9533e293eef73c92ffce95bcd # and modified by Software Heritage. # # Original file: lib/extras.py # # psycopg2 is free software: you can redistribute it and/or modify it under the # terms of the GNU Lesser General Public License as published by the Free # Software Foundation, either version 3 of the License, or (at your option) any # later version. def _paginate(seq, page_size): """Consume an iterable and return it in chunks. Every chunk is at most `page_size`. Never return an empty chunk. """ page = [] it = iter(seq) while 1: try: for i in range(page_size): page.append(next(it)) yield page page = [] except StopIteration: if page: yield page return def _split_sql(sql): """Split *sql* on a single ``%s`` placeholder. Split on the %s, perform %% replacement and return pre, post lists of snippets. """ curr = pre = [] post = [] tokens = re.split(br"(%.)", sql) for token in tokens: if len(token) != 2 or token[:1] != b"%": curr.append(token) continue if token[1:] == b"s": if curr is pre: curr = post else: raise ValueError("the query contains more than one '%s' placeholder") elif token[1:] == b"%": curr.append(b"%") else: raise ValueError( "unsupported format character: '%s'" % token[1:].decode("ascii", "replace") ) if curr is pre: raise ValueError("the query doesn't contain any '%s' placeholder") return pre, post def execute_values_generator(cur, sql, argslist, template=None, page_size=100): """Execute a statement using SQL ``VALUES`` with a sequence of parameters. Rows returned by the query are returned through a generator. You need to consume the generator for the queries to be executed! :param cur: the cursor to use to execute the query. :param sql: the query to execute. It must contain a single ``%s`` placeholder, which will be replaced by a `VALUES list`__. Example: ``"INSERT INTO mytable (id, f1, f2) VALUES %s"``. :param argslist: sequence of sequences or dictionaries with the arguments to send to the query. The type and content must be consistent with *template*. :param template: the snippet to merge to every item in *argslist* to compose the query. - If the *argslist* items are sequences it should contain positional placeholders (e.g. ``"(%s, %s, %s)"``, or ``"(%s, %s, 42)``" if there are constants value...). - If the *argslist* items are mappings it should contain named placeholders (e.g. ``"(%(id)s, %(f1)s, 42)"``). If not specified, assume the arguments are sequence and use a simple positional template (i.e. ``(%s, %s, ...)``), with the number of placeholders sniffed by the first element in *argslist*. :param page_size: maximum number of *argslist* items to include in every statement. If there are more items the function will execute more than one statement. :param yield_from_cur: Whether to yield results from the cursor in this function directly. .. __: https://www.postgresql.org/docs/current/static/queries-values.html After the execution of the function the `cursor.rowcount` property will **not** contain a total result. """ # we can't just use sql % vals because vals is bytes: if sql is bytes # there will be some decoding error because of stupid codec used, and Py3 # doesn't implement % on bytes. if not isinstance(sql, bytes): sql = sql.encode(psycopg2.extensions.encodings[cur.connection.encoding]) pre, post = _split_sql(sql) for page in _paginate(argslist, page_size=page_size): if template is None: template = b"(" + b",".join([b"%s"] * len(page[0])) + b")" parts = pre[:] for args in page: parts.append(cur.mogrify(template, args)) parts.append(b",") parts[-1:] = post cur.execute(b"".join(parts)) yield from cur + + +class SWHDatabaseJanitor(DatabaseJanitor): + """SWH database janitor implementation with a a different setup/teardown policy than + than the stock one. Instead of dropping, creating and initializing the database for + each test, it creates and initializes the db once, then truncates the tables (and + sequences) in between tests. + + This is needed to have acceptable test performances. + + """ + + def __init__( + self, + user: str, + host: str, + port: str, + db_name: str, + version: Union[str, float, Version], + dump_files: Optional[str] = None, + no_truncate_tables: Set[str] = set(), + ) -> None: + super().__init__(user, host, port, db_name, version) + if dump_files: + self.dump_files = sorted(glob.glob(dump_files), key=sortkey) + else: + self.dump_files = [] + # do no truncate the following tables + self.no_truncate_tables = set(no_truncate_tables) + + def db_setup(self): + conninfo = ( + f"host={self.host} user={self.user} port={self.port} dbname={self.db_name}" + ) + + for fname in self.dump_files: + subprocess.check_call( + [ + "psql", + "--quiet", + "--no-psqlrc", + "-v", + "ON_ERROR_STOP=1", + "-d", + conninfo, + "-f", + fname, + ] + ) + + def db_reset(self): + """Truncate tables (all but self.no_truncate_tables set) and sequences + + """ + with psycopg2.connect( + dbname=self.db_name, user=self.user, host=self.host, port=self.port, + ) as cnx: + with cnx.cursor() as cur: + cur.execute( + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = %s", + ("public",), + ) + all_tables = set(table for (table,) in cur.fetchall()) + tables_to_truncate = all_tables - self.no_truncate_tables + + for table in tables_to_truncate: + cur.execute("TRUNCATE TABLE %s CASCADE" % table) + + cur.execute( + "SELECT sequence_name FROM information_schema.sequences " + "WHERE sequence_schema = %s", + ("public",), + ) + seqs = set(seq for (seq,) in cur.fetchall()) + for seq in seqs: + cur.execute("ALTER SEQUENCE %s RESTART;" % seq) + cnx.commit() + + def init(self): + """Initialize db. Create the db if it does not exist. Reset it if it exists.""" + with self.cursor() as cur: + cur.execute( + "SELECT COUNT(1) FROM pg_database WHERE datname=%s;", (self.db_name,) + ) + db_exists = cur.fetchone()[0] == 1 + if db_exists: + cur.execute( + "UPDATE pg_database SET datallowconn=true WHERE datname = %s;", + (self.db_name,), + ) + self.db_reset() + return + + # initialize the inexistent db + with self.cursor() as cur: + cur.execute('CREATE DATABASE "{}";'.format(self.db_name)) + self.db_setup() + + def drop(self): + """The original DatabaseJanitor implementation prevents new connections from happening, + destroys current opened connections and finally drops the database. + + We actually do not want to drop the db so we instead do nothing and resets + (truncate most tables and sequences) the db instead, in order to have some + acceptable performance. + + """ + pass diff --git a/swh/core/db/pytest_plugin.py b/swh/core/db/pytest_plugin.py new file mode 100644 index 0000000..56e3305 --- /dev/null +++ b/swh/core/db/pytest_plugin.py @@ -0,0 +1,62 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging +from typing import Optional, Set + +import psycopg2 +import pytest +from pytest_postgresql import factories + +from swh.core.db.db_utils import SWHDatabaseJanitor + +logger = logging.getLogger(__name__) + + +# the postgres_fact factory fixture below is mostly a copy of the code +# from pytest-postgresql. We need a custom version here to be able to +# specify our version of the DBJanitor we use. +def postgresql_fact( + process_fixture_name: str, + db_name: Optional[str] = None, + dump_files: str = "", + no_truncate_tables: Set[str] = {"dbversion"}, +): + @pytest.fixture + def postgresql_factory(request): + """Fixture factory for PostgreSQL. + + :param FixtureRequest request: fixture request object + :rtype: psycopg2.connection + :returns: postgresql client + """ + config = factories.get_config(request) + proc_fixture = request.getfixturevalue(process_fixture_name) + + pg_host = proc_fixture.host + pg_port = proc_fixture.port + pg_user = proc_fixture.user + pg_options = proc_fixture.options + pg_db = db_name or config["dbname"] + with SWHDatabaseJanitor( + pg_user, + pg_host, + pg_port, + pg_db, + proc_fixture.version, + dump_files=dump_files, + no_truncate_tables=no_truncate_tables, + ): + connection = psycopg2.connect( + dbname=pg_db, + user=pg_user, + host=pg_host, + port=pg_port, + options=pg_options, + ) + yield connection + connection.close() + + return postgresql_factory diff --git a/swh/core/db/tests/data/0-schema.sql b/swh/core/db/tests/data/0-schema.sql new file mode 100644 index 0000000..e6b008b --- /dev/null +++ b/swh/core/db/tests/data/0-schema.sql @@ -0,0 +1,19 @@ +-- schema version table which won't get truncated +create table dbversion ( + version int primary key, + release timestamptz, + description text +); + +-- a people table which won't get truncated +create table people ( + fullname text not null +); + +-- a fun table which will get truncated for each test +create table fun ( + time timestamptz not null +); + +-- one sequence to check for reset as well +create sequence serial; diff --git a/swh/core/db/tests/data/1-data.sql b/swh/core/db/tests/data/1-data.sql new file mode 100644 index 0000000..8680263 --- /dev/null +++ b/swh/core/db/tests/data/1-data.sql @@ -0,0 +1,15 @@ +-- insert some values in dbversion +insert into dbversion(version, release, description) values (1, '2016-02-22 15:56:28.358587+00', 'Work In Progress'); +insert into dbversion(version, release, description) values (2, '2016-02-24 18:05:54.887217+00', 'Work In Progress'); +insert into dbversion(version, release, description) values (3, '2016-10-21 14:10:18.629763+00', 'Work In Progress'); +insert into dbversion(version, release, description) values (4, '2017-08-08 19:01:11.723113+00', 'Work In Progress'); +insert into dbversion(version, release, description) values (7, '2018-03-30 12:58:39.256679+00', 'Work In Progress'); + +insert into fun(time) values ('2020-10-19 09:00:00.666999+00'); +insert into fun(time) values ('2020-10-18 09:00:00.666999+00'); +insert into fun(time) values ('2020-10-17 09:00:00.666999+00'); + +select nextval('serial'); + +insert into people(fullname) values ('dudess'); +insert into people(fullname) values ('dude'); diff --git a/swh/core/db/tests/test_db_utils.py b/swh/core/db/tests/test_db_utils.py new file mode 100644 index 0000000..d4cb0fa --- /dev/null +++ b/swh/core/db/tests/test_db_utils.py @@ -0,0 +1,142 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os + +from swh.core.db import BaseDb +from swh.core.db.pytest_plugin import postgresql_fact + +SQL_DIR = os.path.join(os.path.dirname(__file__), "data") + + +# db with special policy for tables dbversion and people +postgres_fun = postgresql_fact( + "postgresql_proc", + db_name="fun", + dump_files=f"{SQL_DIR}/*.sql", + no_truncate_tables={"dbversion", "people"}, +) + + +def test_smoke_test_fun_db_is_up(postgres_fun): + """This ensures the db is created and configured according to its dumps files. + + """ + with BaseDb.connect(postgres_fun.dsn).cursor() as cur: + cur.execute("select count(*) from dbversion") + nb_rows = cur.fetchone()[0] + assert nb_rows == 5 + + cur.execute("select count(*) from fun") + nb_rows = cur.fetchone()[0] + assert nb_rows == 3 + + cur.execute("select count(*) from people") + nb_rows = cur.fetchone()[0] + assert nb_rows == 2 + + # in data, we requested a value already so it starts at 2 + cur.execute("select nextval('serial')") + val = cur.fetchone()[0] + assert val == 2 + + +def test_smoke_test_fun_db_is_still_up_and_got_reset(postgres_fun): + """This ensures that within another tests, the 'fun' db is still up, created (and not + configured again). This time, most of the data has been reset: + - except for tables 'dbversion' and 'people' which were left as is + - the other tables from the schema (here only "fun") got truncated + - the sequences got truncated as well + + """ + with BaseDb.connect(postgres_fun.dsn).cursor() as cur: + # db version is excluded from the truncate + cur.execute("select count(*) from dbversion") + nb_rows = cur.fetchone()[0] + assert nb_rows == 5 + + # people is also allowed not to be truncated + cur.execute("select count(*) from people") + nb_rows = cur.fetchone()[0] + assert nb_rows == 2 + + # table and sequence are reset + cur.execute("select count(*) from fun") + nb_rows = cur.fetchone()[0] + assert nb_rows == 0 + + cur.execute("select nextval('serial')") + val = cur.fetchone()[0] + assert val == 1 + + +# db with no special policy for tables truncation, all tables are reset +postgres_people = postgresql_fact( + "postgresql_proc", + db_name="people", + dump_files=f"{SQL_DIR}/*.sql", + no_truncate_tables=set(), +) + + +def test_smoke_test_people_db_up(postgres_people): + """'people' db is up and configured + + """ + with BaseDb.connect(postgres_people.dsn).cursor() as cur: + cur.execute("select count(*) from dbversion") + nb_rows = cur.fetchone()[0] + assert nb_rows == 5 + + cur.execute("select count(*) from people") + nb_rows = cur.fetchone()[0] + assert nb_rows == 2 + + cur.execute("select count(*) from fun") + nb_rows = cur.fetchone()[0] + assert nb_rows == 3 + + cur.execute("select nextval('serial')") + val = cur.fetchone()[0] + assert val == 2 + + +def test_smoke_test_people_db_up_and_reset(postgres_people): + """'people' db is up and got reset on every tables and sequences + + """ + with BaseDb.connect(postgres_people.dsn).cursor() as cur: + # tables are truncated after the first round + cur.execute("select count(*) from dbversion") + nb_rows = cur.fetchone()[0] + assert nb_rows == 0 + + # tables are truncated after the first round + cur.execute("select count(*) from people") + nb_rows = cur.fetchone()[0] + assert nb_rows == 0 + + # table and sequence are reset + cur.execute("select count(*) from fun") + nb_rows = cur.fetchone()[0] + assert nb_rows == 0 + + cur.execute("select nextval('serial')") + val = cur.fetchone()[0] + assert val == 1 + + +# db with no initialization step, an empty db +postgres_no_init = postgresql_fact("postgresql_proc", db_name="something") + + +def test_smoke_test_db_no_init(postgres_no_init): + """We can connect to the db nonetheless + + """ + with BaseDb.connect(postgres_no_init.dsn).cursor() as cur: + cur.execute("select now()") + data = cur.fetchone()[0] + assert data is not None