Page MenuHomeSoftware Heritage

D5072.id18277.diff
No OneTemporary

D5072.id18277.diff

diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -3,3 +3,34 @@
Provenance DB module to query the provenance of source code artifacts present
in the Software Heritage archive.
+
+This project makes it possible to build such a provenance database from the
+Software Heritage archive, and to query that database.
+
+## Building a provenance database
+
+Building the provenance database requires read access to the Software
+Heritage archive, either via direct access to the database (preferred for
+better performance), or using the RPC API to a Software Heritage Storage
+instance.
+
+It also needs a PostgreSQL database into which the provenance db will be
+written.
+
+A configuration file providing access to both of these databases is needed:
+
+```
+archive:
+ cls: api
+ storage:
+ cls: remote
+ url: http://uffizi.internal.softwareheritage.org:5002
+
+provenance:
+ cls: local
+ db:
+ dbname: provenance
+ host: localhost
+
+
+```
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -10,7 +10,7 @@
def get_archive(cls: str, **kwargs) -> ArchiveInterface:
if cls == "api":
return ArchiveStorage(**kwargs["storage"])
- elif cls == "ps":
+ elif cls == "direct":
conn = connect(kwargs["db"])
return ArchivePostgreSQL(conn)
else:
@@ -18,11 +18,11 @@
def get_provenance(cls: str, **kwargs) -> ProvenanceInterface:
- if cls == "ps":
+ if cls == "local":
conn = connect(kwargs["db"])
- return ProvenancePostgreSQL(conn)
- elif cls == "ps_np":
- conn = connect(kwargs["db"])
- return ProvenancePostgreSQLNoPath(conn)
+ if kwargs.get("with_path", True):
+ return ProvenancePostgreSQL(conn)
+ else:
+ return ProvenancePostgreSQLNoPath(conn)
else:
raise NotImplementedError
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -10,6 +10,7 @@
import click
import yaml
+from psycopg2.extensions import parse_dsn
from swh.core import config
from swh.core.cli import CONTEXT_SETTINGS
@@ -106,26 +107,35 @@
@cli.command(name="create")
-@click.option("--name", default=None)
+@click.option("--maintenance-db", default=None)
+@click.option("--drop/--no-drop", "drop_db", default=False)
@click.pass_context
-def create(ctx, name):
+def create(ctx, maintenance_db, drop_db):
"""Create new provenance database."""
from .postgresql.db_utils import connect
+ if ctx.obj["config"]["provenance"]["cls"] != "local":
+ raise ValueError(
+ "Unsupported provenance db cls: %s"
+ % (ctx.obj["config"]["provenance"]["cls"])
+ )
+
# Connect to server without selecting a database
- conninfo = ctx.obj["config"]["provenance"]["db"]
- conn = connect(conninfo)
+ dsn = ctx.obj["config"]["provenance"]["db"]
+ if isinstance(dsn, str):
+ dsn = parse_dsn(dsn)
+ dbname = dsn.pop("dbname")
+ if maintenance_db:
+ dsn["dbname"] = maintenance_db
- if ctx.obj["config"]["provenance"]["cls"] == "ps":
- from .postgresql.provenance import create_database
+ conn = connect(dsn)
- create_database(conn, conninfo, name)
- elif ctx.obj["config"]["provenance"]["cls"] == "ps_np":
+ if ctx.obj["config"]["provenance"].get("with_path"):
+ from .postgresql.provenance import create_database
+ else:
from .postgresql_nopath.provenance import create_database
- create_database(conn, conninfo, name)
- else:
- raise NotImplementedError
+ create_database(conn, dbname, drop_db)
@cli.command(name="iter-revisions")
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -19,7 +19,9 @@
return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path
-def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str):
+def create_database(
+ conn: psycopg2.extensions.connection, name: str, drop_db: bool = True
+):
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
# Normalize dbname to avoid issues when reconnecting below
@@ -27,11 +29,13 @@
# Create new database dropping previous one if exists
cursor = conn.cursor()
- cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
+ if drop_db:
+ cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
cursor.execute(f"""CREATE DATABASE {name}""")
conn.close()
# Reconnect to server selecting newly created database to add tables
+ conninfo = psycopg2.extensions.parse_dsn(conn.dsn)
conninfo["dbname"] = name
conn = connect(conninfo)
diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py
--- a/swh/provenance/postgresql_nopath/provenance.py
+++ b/swh/provenance/postgresql_nopath/provenance.py
@@ -15,7 +15,9 @@
from ..revision import RevisionEntry
-def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str):
+def create_database(
+ conn: psycopg2.extensions.connection, name: str, drop_db: bool = True
+):
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
# Normalize dbname to avoid issues when reconnecting below
@@ -23,11 +25,13 @@
# Create new database dropping previous one if exists
cursor = conn.cursor()
- cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
+ if drop_db:
+ cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
cursor.execute(f"""CREATE DATABASE {name}""")
conn.close()
# Reconnect to server selecting newly created database to add tables
+ conninfo = psycopg2.extensions.parse_dsn(conn.dsn)
conninfo["dbname"] = name
conn = connect(conninfo)
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_cli.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+import yaml
+from click.testing import CliRunner
+from psycopg2.extensions import parse_dsn
+
+import swh.provenance.cli # noqa ; ensure cli is loaded
+from swh.core.cli import swh as swhmain
+from swh.core.db.pytest_plugin import postgresql_fact
+
+pytest_plugins = ["swh.storage.pytest_plugin"]
+
+provenance_db = postgresql_fact("postgresql_proc", db_name="provenance",)
+
+
+def test_cli_swh_db_help():
+ # swhmain.add_command(provenance_cli)
+ result = CliRunner().invoke(swhmain, ["provenance", "-h"])
+ assert result.exit_code == 0
+ assert "Commands:" in result.output
+ commands = result.output.split("Commands:")[1]
+ for command in (
+ "create",
+ "find-all",
+ "find-first",
+ "iter-origins",
+ "iter-revisions",
+ ):
+ assert f" {command} " in commands
+
+
+@pytest.mark.parametrize("with_path", (True, False))
+def test_cli_create(provenance_db, tmp_path, with_path):
+ conffile = tmp_path / "config.yml"
+ dsn = parse_dsn(provenance_db.dsn)
+ dsn["dbname"] = "test_provenance"
+ conf = {
+ "provenance": {"cls": "local", "with_path": with_path, "db": dsn,},
+ }
+ yaml.dump(conf, conffile.open("w"))
+ result = CliRunner().invoke(
+ swhmain, ["provenance", "--config-file", str(conffile), "create", "--drop"]
+ )
+ assert result.exit_code == 0, result.output
+
+ # this will fail because the db already exists
+ result = CliRunner().invoke(
+ swhmain, ["provenance", "--config-file", str(conffile), "create"]
+ )
+ assert result.exit_code == 1, result.output

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 3:23 PM (1 h, 22 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218263

Event Timeline