diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -3,3 +3,34 @@ Provenance DB module to query the provenance of source code artifacts present in the Software Heritage archive. + +This project allows building such a provenance db from the Software Heritage +Archive, and querying this database. + +## Building a provenance database + +Building the provenance database requires read access to the Software +Heritage archive, either via direct access to the database (preferred for +better performance), or using the RPC API to a Software Heritage Storage +instance. + +It also needs a PostgreSQL database into which the provenance db will be +written. + +A configuration file is needed with the access information for both these databases: + +``` +archive: + cls: api + storage: + cls: remote + url: http://uffizi.internal.softwareheritage.org:5002 + +provenance: + cls: local + db: + dbname: provenance + host: localhost + + +``` diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -10,7 +10,7 @@ def get_archive(cls: str, **kwargs) -> ArchiveInterface: if cls == "api": return ArchiveStorage(**kwargs["storage"]) - elif cls == "ps": + elif cls == "direct": conn = connect(kwargs["db"]) return ArchivePostgreSQL(conn) else: @@ -18,11 +18,11 @@ def get_provenance(cls: str, **kwargs) -> ProvenanceInterface: - if cls == "ps": + if cls == "local": conn = connect(kwargs["db"]) - return ProvenancePostgreSQL(conn) - elif cls == "ps_np": - conn = connect(kwargs["db"]) - return ProvenancePostgreSQLNoPath(conn) + if kwargs.get("with_path", True): + return ProvenancePostgreSQL(conn) + else: + return ProvenancePostgreSQLNoPath(conn) else: raise NotImplementedError diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -10,6 +10,7 @@ import click import yaml +from psycopg2.extensions import parse_dsn from swh.core import config from 
swh.core.cli import CONTEXT_SETTINGS @@ -106,26 +107,35 @@ @cli.command(name="create") -@click.option("--name", default=None) +@click.option("--maintenance-db", default=None) +@click.option("--drop/--no-drop", "drop_db", default=False) @click.pass_context -def create(ctx, name): +def create(ctx, maintenance_db, drop_db): """Create new provenance database.""" from .postgresql.db_utils import connect + if ctx.obj["config"]["provenance"]["cls"] != "local": + raise ValueError( + "Unsupported provenance db cls: %s" + % (ctx.obj["config"]["provenance"]["cls"]) + ) + # Connect to server without selecting a database - conninfo = ctx.obj["config"]["provenance"]["db"] - conn = connect(conninfo) + dsn = ctx.obj["config"]["provenance"]["db"] + if isinstance(dsn, str): + dsn = parse_dsn(dsn) + dbname = dsn.pop("dbname") + if maintenance_db: + dsn["dbname"] = maintenance_db - if ctx.obj["config"]["provenance"]["cls"] == "ps": - from .postgresql.provenance import create_database + conn = connect(dsn) - create_database(conn, conninfo, name) - elif ctx.obj["config"]["provenance"]["cls"] == "ps_np": + if ctx.obj["config"]["provenance"].get("with_path"): + from .postgresql.provenance import create_database + else: from .postgresql_nopath.provenance import create_database - create_database(conn, conninfo, name) - else: - raise NotImplementedError + create_database(conn, dbname, drop_db) @cli.command(name="iter-revisions") diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -19,7 +19,9 @@ return path[2:] if path.startswith(bytes("." 
+ os.path.sep, "utf-8")) else path -def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): +def create_database( + conn: psycopg2.extensions.connection, name: str, drop_db: bool = True +): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below @@ -27,11 +29,13 @@ # Create new database dropping previous one if exists cursor = conn.cursor() - cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") + if drop_db: + cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE DATABASE {name}""") conn.close() # Reconnect to server selecting newly created database to add tables + conninfo = psycopg2.extensions.parse_dsn(conn.dsn) conninfo["dbname"] = name conn = connect(conninfo) diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py --- a/swh/provenance/postgresql_nopath/provenance.py +++ b/swh/provenance/postgresql_nopath/provenance.py @@ -15,7 +15,9 @@ from ..revision import RevisionEntry -def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): +def create_database( + conn: psycopg2.extensions.connection, name: str, drop_db: bool = True +): conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) # Normalize dbname to avoid issues when reconnecting below @@ -23,11 +25,13 @@ # Create new database dropping previous one if exists cursor = conn.cursor() - cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") + if drop_db: + cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") cursor.execute(f"""CREATE DATABASE {name}""") conn.close() # Reconnect to server selecting newly created database to add tables + conninfo = psycopg2.extensions.parse_dsn(conn.dsn) conninfo["dbname"] = name conn = connect(conninfo) diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py new file mode 100644 --- /dev/null +++ 
b/swh/provenance/tests/test_cli.py @@ -0,0 +1,54 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest +import yaml +from click.testing import CliRunner +from psycopg2.extensions import parse_dsn + +import swh.provenance.cli # noqa ; ensure cli is loaded +from swh.core.cli import swh as swhmain +from swh.core.db.pytest_plugin import postgresql_fact + +pytest_plugins = ["swh.storage.pytest_plugin"] + +provenance_db = postgresql_fact("postgresql_proc", db_name="provenance",) + + +def test_cli_swh_db_help(): + # swhmain.add_command(provenance_cli) + result = CliRunner().invoke(swhmain, ["provenance", "-h"]) + assert result.exit_code == 0 + assert "Commands:" in result.output + commands = result.output.split("Commands:")[1] + for command in ( + "create", + "find-all", + "find-first", + "iter-origins", + "iter-revisions", + ): + assert f" {command} " in commands + + +@pytest.mark.parametrize("with_path", (True, False)) +def test_cli_create(provenance_db, tmp_path, with_path): + conffile = tmp_path / "config.yml" + dsn = parse_dsn(provenance_db.dsn) + dsn["dbname"] = "test_provenance" + conf = { + "provenance": {"cls": "local", "with_path": with_path, "db": dsn,}, + } + yaml.dump(conf, conffile.open("w")) + result = CliRunner().invoke( + swhmain, ["provenance", "--config-file", str(conffile), "create", "--drop"] + ) + assert result.exit_code == 0, result.output + + # this will fail because the db already exists + result = CliRunner().invoke( + swhmain, ["provenance", "--config-file", str(conffile), "create"] + ) + assert result.exit_code == 1, result.output