Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163778
D5072.id18277.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D5072.id18277.diff
View Options
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -3,3 +3,34 @@
Provenance DB module to query the provenance of source code artifacts present
in the Software Heritage archive.
+
+This project allows building such a provenance db from the Software Heritage
+Archive, and querying this database.
+
+## Building a provenance database
+
+Building the provenance database requires read access to the Software
+Heritage archive, either via direct access to the database (preferred for
+better performance), or using the RPC API of a Software Heritage Storage
+instance.
+
+It also needs a PostgreSQL database into which the provenance db will be
+written.
+
+A configuration file is needed with the access settings for both these databases:
+
+```
+archive:
+ cls: api
+ storage:
+ cls: remote
+ url: http://uffizi.internal.softwareheritage.org:5002
+
+provenance:
+ cls: local
+ db:
+ dbname: provenance
+ host: localhost
+
+
+```
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -10,7 +10,7 @@
def get_archive(cls: str, **kwargs) -> ArchiveInterface:
if cls == "api":
return ArchiveStorage(**kwargs["storage"])
- elif cls == "ps":
+ elif cls == "direct":
conn = connect(kwargs["db"])
return ArchivePostgreSQL(conn)
else:
@@ -18,11 +18,11 @@
def get_provenance(cls: str, **kwargs) -> ProvenanceInterface:
- if cls == "ps":
+ if cls == "local":
conn = connect(kwargs["db"])
- return ProvenancePostgreSQL(conn)
- elif cls == "ps_np":
- conn = connect(kwargs["db"])
- return ProvenancePostgreSQLNoPath(conn)
+ if kwargs.get("with_path", True):
+ return ProvenancePostgreSQL(conn)
+ else:
+ return ProvenancePostgreSQLNoPath(conn)
else:
raise NotImplementedError
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -10,6 +10,7 @@
import click
import yaml
+from psycopg2.extensions import parse_dsn
from swh.core import config
from swh.core.cli import CONTEXT_SETTINGS
@@ -106,26 +107,35 @@
@cli.command(name="create")
-@click.option("--name", default=None)
+@click.option("--maintenance-db", default=None)
+@click.option("--drop/--no-drop", "drop_db", default=False)
@click.pass_context
-def create(ctx, name):
+def create(ctx, maintenance_db, drop_db):
"""Create new provenance database."""
from .postgresql.db_utils import connect
+ if ctx.obj["config"]["provenance"]["cls"] != "local":
+ raise ValueError(
+ "Unsupported provenance db cls: %s"
+ % (ctx.obj["config"]["provenance"]["cls"])
+ )
+
# Connect to server without selecting a database
- conninfo = ctx.obj["config"]["provenance"]["db"]
- conn = connect(conninfo)
+ dsn = ctx.obj["config"]["provenance"]["db"]
+ if isinstance(dsn, str):
+ dsn = parse_dsn(dsn)
+ dbname = dsn.pop("dbname")
+ if maintenance_db:
+ dsn["dbname"] = maintenance_db
- if ctx.obj["config"]["provenance"]["cls"] == "ps":
- from .postgresql.provenance import create_database
+ conn = connect(dsn)
- create_database(conn, conninfo, name)
- elif ctx.obj["config"]["provenance"]["cls"] == "ps_np":
+ if ctx.obj["config"]["provenance"].get("with_path"):
+ from .postgresql.provenance import create_database
+ else:
from .postgresql_nopath.provenance import create_database
- create_database(conn, conninfo, name)
- else:
- raise NotImplementedError
+ create_database(conn, dbname, drop_db)
@cli.command(name="iter-revisions")
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -19,7 +19,9 @@
return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path
-def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str):
+def create_database(
+ conn: psycopg2.extensions.connection, name: str, drop_db: bool = True
+):
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
# Normalize dbname to avoid issues when reconnecting below
@@ -27,11 +29,13 @@
# Create new database dropping previous one if exists
cursor = conn.cursor()
- cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
+ if drop_db:
+ cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
cursor.execute(f"""CREATE DATABASE {name}""")
conn.close()
# Reconnect to server selecting newly created database to add tables
+ conninfo = psycopg2.extensions.parse_dsn(conn.dsn)
conninfo["dbname"] = name
conn = connect(conninfo)
diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py
--- a/swh/provenance/postgresql_nopath/provenance.py
+++ b/swh/provenance/postgresql_nopath/provenance.py
@@ -15,7 +15,9 @@
from ..revision import RevisionEntry
-def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str):
+def create_database(
+ conn: psycopg2.extensions.connection, name: str, drop_db: bool = True
+):
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
# Normalize dbname to avoid issues when reconnecting below
@@ -23,11 +25,13 @@
# Create new database dropping previous one if exists
cursor = conn.cursor()
- cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
+ if drop_db:
+ cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
cursor.execute(f"""CREATE DATABASE {name}""")
conn.close()
# Reconnect to server selecting newly created database to add tables
+ conninfo = psycopg2.extensions.parse_dsn(conn.dsn)
conninfo["dbname"] = name
conn = connect(conninfo)
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_cli.py
@@ -0,0 +1,54 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+import yaml
+from click.testing import CliRunner
+from psycopg2.extensions import parse_dsn
+
+import swh.provenance.cli # noqa ; ensure cli is loaded
+from swh.core.cli import swh as swhmain
+from swh.core.db.pytest_plugin import postgresql_fact
+
+pytest_plugins = ["swh.storage.pytest_plugin"]
+
+provenance_db = postgresql_fact("postgresql_proc", db_name="provenance",)
+
+
+def test_cli_swh_db_help():
+ # swhmain.add_command(provenance_cli)
+ result = CliRunner().invoke(swhmain, ["provenance", "-h"])
+ assert result.exit_code == 0
+ assert "Commands:" in result.output
+ commands = result.output.split("Commands:")[1]
+ for command in (
+ "create",
+ "find-all",
+ "find-first",
+ "iter-origins",
+ "iter-revisions",
+ ):
+ assert f" {command} " in commands
+
+
+@pytest.mark.parametrize("with_path", (True, False))
+def test_cli_create(provenance_db, tmp_path, with_path):
+ conffile = tmp_path / "config.yml"
+ dsn = parse_dsn(provenance_db.dsn)
+ dsn["dbname"] = "test_provenance"
+ conf = {
+ "provenance": {"cls": "local", "with_path": with_path, "db": dsn,},
+ }
+ yaml.dump(conf, conffile.open("w"))
+ result = CliRunner().invoke(
+ swhmain, ["provenance", "--config-file", str(conffile), "create", "--drop"]
+ )
+ assert result.exit_code == 0, result.output
+
+ # this will fail because the db already exists
+ result = CliRunner().invoke(
+ swhmain, ["provenance", "--config-file", str(conffile), "create"]
+ )
+ assert result.exit_code == 1, result.output
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 3:23 PM (1 h, 22 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218263
Attached To
D5072: Refactor the db scaffolding to use swh.core.db
Event Timeline
Log In to Comment