diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -3,3 +3,34 @@ Provenance DB module to query the provenance of source code artifacts present in the Software Heritage archive. + +This project allows to build such a provenance db from the Software Heritage +Archive, and query this database. + +## Building a provenance database + +Building the provenance database requires a read access to the Software +Heritage archive, either via a direct access to the database (preferred for +better performances), or using the RPC API to a Software Heritage Storage +instance. + +It also need a postgresql database in which the provenance db will be written +into. + +A configuration file is needed with with the access to both these databases: + +``` +archive: + cls: api + storage: + cls: remote + url: http://uffizi.internal.softwareheritage.org:5002 + +provenance: + cls: ps + db: + dbname: provenance + host: localhost + + +``` diff --git a/conftest.py b/conftest.py new file mode 100644 --- /dev/null +++ b/conftest.py @@ -0,0 +1 @@ +pytest_plugins = ["swh.core.db.pytest_plugin", "swh.storage.pytest_plugin"] diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ # Add here internal Software Heritage dependencies, one per line. -swh.core +swh.core >= 0.12 swh.model -swh.storage \ No newline at end of file +swh.storage diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -10,7 +10,7 @@ def get_archive(cls: str, **kwargs) -> ArchiveInterface: if cls == "api": return ArchiveStorage(**kwargs["storage"]) - elif cls == "ps": + elif cls == "direct": conn = connect(kwargs["db"]) return ArchivePostgreSQL(conn) else: @@ -18,11 +18,11 @@ def get_provenance(cls: str, **kwargs) -> ProvenanceInterface: - if cls == "ps": + if cls == "local": conn = connect(kwargs["db"]) - return ProvenancePostgreSQL(conn) - elif cls == "ps_np": - conn = connect(kwargs["db"]) - return ProvenancePostgreSQLNoPath(conn) + if kwargs.get("with_path", True): + return ProvenancePostgreSQL(conn) + else: + return ProvenancePostgreSQLNoPath(conn) else: raise NotImplementedError diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py --- a/swh/provenance/cli.py +++ b/swh/provenance/cli.py @@ -105,27 +105,17 @@ atexit.register(exit) -@cli.command(name="create") -@click.option("--name", default=None) +@cli.command(name="create", deprecated=True) +@click.option("--maintenance-db", default=None) +@click.option("--drop/--no-drop", "drop_db", default=False) @click.pass_context -def create(ctx, name): - """Create new provenance database.""" - from .postgresql.db_utils import connect - - # Connect to server without selecting a database - conninfo = ctx.obj["config"]["provenance"]["db"] - conn = connect(conninfo) - - if ctx.obj["config"]["provenance"]["cls"] == "ps": - from .postgresql.provenance import create_database - - create_database(conn, conninfo, name) - elif ctx.obj["config"]["provenance"]["cls"] == "ps_np": - from .postgresql_nopath.provenance import create_database - - create_database(conn, conninfo, name) - else: - raise NotImplementedError +def create(ctx, maintenance_db, drop_db): + """Deprecated, please use: + swh db create provenance + and + swh db init provenance + instead. + """ @cli.command(name="iter-revisions") diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py --- a/swh/provenance/postgresql/provenance.py +++ b/swh/provenance/postgresql/provenance.py @@ -18,27 +18,6 @@ def normalize(path: bytes) -> bytes: return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path - -def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): - conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) - - # Normalize dbname to avoid issues when reconnecting below - name = name.casefold() - - # Create new database dropping previous one if exists - cursor = conn.cursor() - cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") - cursor.execute(f"""CREATE DATABASE {name}""") - conn.close() - - # Reconnect to server selecting newly created database to add tables - conninfo["dbname"] = name - conn = connect(conninfo) - - sqldir = os.path.dirname(os.path.realpath(__file__)) - execute_sql(conn, os.path.join(sqldir, "provenance.sql")) - - ######################################################################################## ######################################################################################## ######################################################################################## diff --git a/swh/provenance/postgresql/provenance.sql b/swh/provenance/postgresql/provenance.sql deleted file mode 100644 --- a/swh/provenance/postgresql/provenance.sql +++ /dev/null @@ -1,150 +0,0 @@ --- a Git object ID, i.e., a Git-style salted SHA1 checksum -drop domain if exists sha1_git cascade; -create domain sha1_git as bytea check (length(value) = 20); - --- UNIX path (absolute, relative, individual path component, etc.) -drop domain if exists unix_path cascade; -create domain unix_path as bytea; - - -drop table if exists content; -create table content -( - id bigserial primary key, -- internal identifier of the content blob - sha1 sha1_git unique not null, -- intrinsic identifier of the content blob - date timestamptz not null -- timestamp of the revision where the blob appears early -); - -comment on column content.id is 'Content internal identifier'; -comment on column content.sha1 is 'Content intrinsic identifier'; -comment on column content.date is 'Earliest timestamp for the content (first seen time)'; - - -drop table if exists content_early_in_rev; -create table content_early_in_rev -( - blob bigint not null, -- internal identifier of the content blob - rev bigint not null, -- internal identifier of the revision where the blob appears for the first time - loc bigint not null, -- location of the content relative to the revision root directory - primary key (blob, rev, loc) - -- foreign key (blob) references content (id), - -- foreign key (rev) references revision (id), - -- foreign key (loc) references location (id) -); - -comment on column content_early_in_rev.blob is 'Content internal identifier'; -comment on column content_early_in_rev.rev is 'Revision internal identifier'; -comment on column content_early_in_rev.loc is 'Location of content in revision'; - - -drop table if exists content_in_dir; -create table content_in_dir -( - blob bigint not null, -- internal identifier of the content blob - dir bigint not null, -- internal identifier of the directory containing the blob - loc bigint not null, -- location of the content relative to its parent directory in the isochrone frontier - primary key (blob, dir, loc) - -- foreign key (blob) references content (id), - -- foreign key (dir) references directory (id), - -- foreign key (loc) references location (id) -); - -comment on column content_in_dir.blob is 'Content internal identifier'; -comment on column content_in_dir.dir is 'Directory internal identifier'; -comment on column content_in_dir.loc is 'Location of content in directory'; - - -drop table if exists directory; -create table directory -( - id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier - sha1 sha1_git unique not null, -- intrinsic identifier of the directory - date timestamptz not null -- max timestamp among those of the directory children's -); - -comment on column directory.id is 'Directory internal identifier'; -comment on column directory.sha1 is 'Directory intrinsic identifier'; -comment on column directory.date is 'Latest timestamp for the content in the directory'; - - -drop table if exists directory_in_rev; -create table directory_in_rev -( - dir bigint not null, -- internal identifier of the directory appearing in the revision - rev bigint not null, -- internal identifier of the revision containing the directory - loc bigint not null, -- location of the directory relative to the revision root directory - primary key (dir, rev, loc) - -- foreign key (dir) references directory (id), - -- foreign key (rev) references revision (id), - -- foreign key (loc) references location (id) -); - -comment on column directory_in_rev.dir is 'Directory internal identifier'; -comment on column directory_in_rev.rev is 'Revision internal identifier'; -comment on column directory_in_rev.loc is 'Location of directory in revision'; - - -drop table if exists location; -create table location -( - id bigserial primary key, -- internal identifier of the location - path unix_path unique not null -- path to the location -); - -comment on column location.id is 'Location internal identifier'; -comment on column location.path is 'Path to the location'; - - -drop table if exists origin; -create table origin -( - id bigserial primary key, -- internal identifier of the origin - url unix_path unique not null -- url of the origin -); - -comment on column origin.id is 'Origin internal identifier'; -comment on column origin.url is 'URL of the origin'; - - -drop table if exists revision; -create table revision -( - id bigserial primary key, -- internal identifier of the revision - sha1 sha1_git unique not null, -- intrinsic identifier of the revision - date timestamptz not null, -- timestamp of the revision - org bigint -- id of the preferred origin - -- foreign key (org) references origin (id) -); - -comment on column revision.id is 'Revision internal identifier'; -comment on column revision.sha1 is 'Revision intrinsic identifier'; -comment on column revision.date is 'Revision timestamp'; -comment on column revision.org is 'preferred origin for the revision'; - - -drop table if exists revision_before_rev; -create table revision_before_rev -( - prev bigserial not null, -- internal identifier of the source revision - next bigserial not null, -- internal identifier of the destination revision - primary key (prev, next) - -- foreign key (prev) references revision (id), - -- foreign key (next) references revision (id) -); - -comment on column revision_before_rev.prev is 'Source revision internal identifier'; -comment on column revision_before_rev.next is 'Destination revision internal identifier'; - - -drop table if exists revision_in_org; -create table revision_in_org -( - rev bigint not null, -- internal identifier of the revision poined by the origin - org bigint not null, -- internal identifier of the origin that points to the revision - primary key (rev, org) - -- foreign key (rev) references revision (id), - -- foreign key (org) references origin (id) -); - -comment on column revision_in_org.rev is 'Revision internal identifier'; -comment on column revision_in_org.org is 'Origin internal identifier'; diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py --- a/swh/provenance/postgresql_nopath/provenance.py +++ b/swh/provenance/postgresql_nopath/provenance.py @@ -15,26 +15,6 @@ from ..revision import RevisionEntry -def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str): - conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) - - # Normalize dbname to avoid issues when reconnecting below - name = name.casefold() - - # Create new database dropping previous one if exists - cursor = conn.cursor() - cursor.execute(f"""DROP DATABASE IF EXISTS {name}""") - cursor.execute(f"""CREATE DATABASE {name}""") - conn.close() - - # Reconnect to server selecting newly created database to add tables - conninfo["dbname"] = name - conn = connect(conninfo) - - sqldir = os.path.dirname(os.path.realpath(__file__)) - execute_sql(conn, os.path.join(sqldir, "provenance.sql")) - - ######################################################################################## ######################################################################################## ######################################################################################## diff --git a/swh/provenance/postgresql_nopath/provenance.sql b/swh/provenance/sql/30-schema.sql rename from swh/provenance/postgresql_nopath/provenance.sql rename to swh/provenance/sql/30-schema.sql --- a/swh/provenance/postgresql_nopath/provenance.sql +++ b/swh/provenance/sql/30-schema.sql @@ -1,92 +1,37 @@ -- a Git object ID, i.e., a Git-style salted SHA1 checksum -drop domain if exists sha1_git cascade; create domain sha1_git as bytea check (length(value) = 20); -- UNIX path (absolute, relative, individual path component, etc.) -drop domain if exists unix_path cascade; create domain unix_path as bytea; - -drop table if exists content; create table content ( id bigserial primary key, -- internal identifier of the content blob sha1 sha1_git unique not null, -- intrinsic identifier of the content blob date timestamptz not null -- timestamp of the revision where the blob appears early ); - comment on column content.id is 'Content internal identifier'; comment on column content.sha1 is 'Content intrinsic identifier'; comment on column content.date is 'Earliest timestamp for the content (first seen time)'; - -drop table if exists content_early_in_rev; -create table content_early_in_rev -( - blob bigint not null, -- internal identifier of the content blob - rev bigint not null, -- internal identifier of the revision where the blob appears for the first time - primary key (blob, rev) - -- foreign key (blob) references content (id), - -- foreign key (rev) references revision (id) -); - -comment on column content_early_in_rev.blob is 'Content internal identifier'; -comment on column content_early_in_rev.rev is 'Revision internal identifier'; - - -drop table if exists content_in_dir; -create table content_in_dir -( - blob bigint not null, -- internal identifier of the content blob - dir bigint not null, -- internal identifier of the directory containing the blob - primary key (blob, dir) - -- foreign key (blob) references content (id), - -- foreign key (dir) references directory (id) -); - -comment on column content_in_dir.blob is 'Content internal identifier'; -comment on column content_in_dir.dir is 'Directory internal identifier'; - - -drop table if exists directory; create table directory ( id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier sha1 sha1_git unique not null, -- intrinsic identifier of the directory date timestamptz not null -- max timestamp among those of the directory children's ); - comment on column directory.id is 'Directory internal identifier'; comment on column directory.sha1 is 'Directory intrinsic identifier'; comment on column directory.date is 'Latest timestamp for the content in the directory'; - -drop table if exists directory_in_rev; -create table directory_in_rev -( - dir bigint not null, -- internal identifier of the directory appearing in the revision - rev bigint not null, -- internal identifier of the revision containing the directory - primary key (dir, rev) - -- foreign key (dir) references directory (id), - -- foreign key (rev) references revision (id) -); - -comment on column directory_in_rev.dir is 'Directory internal identifier'; -comment on column directory_in_rev.rev is 'Revision internal identifier'; - - -drop table if exists origin; create table origin ( id bigserial primary key, -- internal identifier of the origin url unix_path unique not null -- url of the origin ); - comment on column origin.id is 'Origin internal identifier'; comment on column origin.url is 'URL of the origin'; - -drop table if exists revision; create table revision ( id bigserial primary key, -- internal identifier of the revision @@ -95,14 +40,11 @@ org bigint -- id of the preferred origin -- foreign key (org) references origin (id) ); - comment on column revision.id is 'Revision internal identifier'; comment on column revision.sha1 is 'Revision intrinsic identifier'; comment on column revision.date is 'Revision timestamp'; comment on column revision.org is 'preferred origin for the revision'; - -drop table if exists revision_before_rev; create table revision_before_rev ( prev bigserial not null, -- internal identifier of the source revision @@ -111,12 +53,9 @@ -- foreign key (prev) references revision (id), -- foreign key (next) references revision (id) ); - comment on column revision_before_rev.prev is 'Source revision internal identifier'; comment on column revision_before_rev.next is 'Destination revision internal identifier'; - -drop table if exists revision_in_org; create table revision_in_org ( rev bigint not null, -- internal identifier of the revision poined by the origin @@ -125,6 +64,5 @@ -- foreign key (rev) references revision (id), -- foreign key (org) references origin (id) ); - comment on column revision_in_org.rev is 'Revision internal identifier'; comment on column revision_in_org.org is 'Origin internal identifier'; diff --git a/swh/provenance/sql/35-schema-with-path-flavor.sql b/swh/provenance/sql/35-schema-with-path-flavor.sql new file mode 100644 --- /dev/null +++ b/swh/provenance/sql/35-schema-with-path-flavor.sql @@ -0,0 +1,46 @@ +create table content_early_in_rev +( + blob bigint not null, -- internal identifier of the content blob + rev bigint not null, -- internal identifier of the revision where the blob appears for the first time + loc bigint not null -- location of the content relative to the revision root directory + -- foreign key (blob) references content (id), + -- foreign key (rev) references revision (id), + -- foreign key (loc) references location (id) +); +comment on column content_early_in_rev.blob is 'Content internal identifier'; +comment on column content_early_in_rev.rev is 'Revision internal identifier'; +comment on column content_early_in_rev.loc is 'Location of content in revision'; + +create table content_in_dir +( + blob bigint not null, -- internal identifier of the content blob + dir bigint not null, -- internal identifier of the directory containing the blob + loc bigint not null -- location of the content relative to its parent directory in the isochrone frontier + -- foreign key (blob) references content (id), + -- foreign key (dir) references directory (id), + -- foreign key (loc) references location (id) +); +comment on column content_in_dir.blob is 'Content internal identifier'; +comment on column content_in_dir.dir is 'Directory internal identifier'; +comment on column content_in_dir.loc is 'Location of content in directory'; + +create table directory_in_rev +( + dir bigint not null, -- internal identifier of the directory appearing in the revision + rev bigint not null, -- internal identifier of the revision containing the directory + loc bigint not null -- location of the directory relative to the revision root directory + -- foreign key (dir) references directory (id), + -- foreign key (rev) references revision (id), + -- foreign key (loc) references location (id) +); +comment on column directory_in_rev.dir is 'Directory internal identifier'; +comment on column directory_in_rev.rev is 'Revision internal identifier'; +comment on column directory_in_rev.loc is 'Location of directory in revision'; + +create table location +( + id bigserial primary key, -- internal identifier of the location + path unix_path unique not null -- path to the location +); +comment on column location.id is 'Location internal identifier'; +comment on column location.path is 'Path to the location'; diff --git a/swh/provenance/sql/35-schema-without-path-flavor.sql b/swh/provenance/sql/35-schema-without-path-flavor.sql new file mode 100644 --- /dev/null +++ b/swh/provenance/sql/35-schema-without-path-flavor.sql @@ -0,0 +1,29 @@ +create table content_early_in_rev +( + blob bigint not null, -- internal identifier of the content blob + rev bigint not null -- internal identifier of the revision where the blob appears for the first time + -- foreign key (blob) references content (id), + -- foreign key (rev) references revision (id) +); +comment on column content_early_in_rev.blob is 'Content internal identifier'; +comment on column content_early_in_rev.rev is 'Revision internal identifier'; + +create table content_in_dir +( + blob bigint not null, -- internal identifier of the content blob + dir bigint not null -- internal identifier of the directory containing the blob + -- foreign key (blob) references content (id), + -- foreign key (dir) references directory (id) +); +comment on column content_in_dir.blob is 'Content internal identifier'; +comment on column content_in_dir.dir is 'Directory internal identifier'; + +create table directory_in_rev +( + dir bigint not null, -- internal identifier of the directory appearing in the revision + rev bigint not null -- internal identifier of the revision containing the directory + -- foreign key (dir) references directory (id), + -- foreign key (rev) references revision (id) +); +comment on column directory_in_rev.dir is 'Directory internal identifier'; +comment on column directory_in_rev.rev is 'Revision internal identifier'; diff --git a/swh/provenance/sql/60-indexes-with-path-flavor.sql b/swh/provenance/sql/60-indexes-with-path-flavor.sql new file mode 100644 --- /dev/null +++ b/swh/provenance/sql/60-indexes-with-path-flavor.sql @@ -0,0 +1,3 @@ +alter table content_early_in_rev add primary key (blob, rev, loc); +alter table content_in_dir add primary key (blob, dir, loc); +alter table directory_in_rev add primary key (dir, rev, loc); diff --git a/swh/provenance/sql/60-indexes-without-path-flavor.sql b/swh/provenance/sql/60-indexes-without-path-flavor.sql new file mode 100644 --- /dev/null +++ b/swh/provenance/sql/60-indexes-without-path-flavor.sql @@ -0,0 +1,3 @@ +alter table content_early_in_rev add primary key (blob, rev); +alter table content_in_dir add primary key (blob, dir); +alter table directory_in_rev add primary key (dir, rev); diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/conftest.py @@ -0,0 +1,28 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import glob +from os import path + +import pytest + +from swh.core.db.pytest_plugin import postgresql_fact +from swh.core.utils import numfile_sortkey as sortkey +import swh.provenance + + +SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql") +SQL_FILES = [sqlfile for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey) + if '-without-path-' not in sqlfile] + +provenance_db = postgresql_fact( + "postgresql_proc", db_name="provenance", dump_files=SQL_FILES) + + +@pytest.fixture +def provenance(provenance_db): + """return a working and initialized provenance db""" + from swh.provenance.postgresql.provenance import ProvenancePostgreSQL as ProvenanceDB + return ProvenanceDB(provenance_db) diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/test_cli.py @@ -0,0 +1,39 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from click.testing import CliRunner +import yaml + +from swh.core.cli import swh as swhmain +import swh.provenance.cli # noqa ; ensure cli is loaded + + +def test_cli_swh_db_help(): + # swhmain.add_command(provenance_cli) + result = CliRunner().invoke(swhmain, ["provenance", "-h"]) + assert result.exit_code == 0 + assert "Commands:" in result.output + commands = result.output.split("Commands:")[1] + for command in ( + "create", + "find-all", + "find-first", + "iter-origins", + "iter-revisions", + ): + assert f" {command} " in commands + + +def test_cli_create_deprecated(provenance_db, tmp_path): + conffile = tmp_path / "config.yml" + conf = { + "provenance": {"cls": "local", "with_path": True,}, + } + yaml.dump(conf, conffile.open("w")) + result = CliRunner().invoke( + swhmain, ["provenance", "--config-file", str(conffile), "create", "--drop"] + ) + assert result.exit_code == 0, result.output + assert "DeprecationWarning" in result.output diff --git a/swh/provenance/tests/test_conftest.py b/swh/provenance/tests/test_conftest.py new file mode 100644 --- /dev/null +++ b/swh/provenance/tests/test_conftest.py @@ -0,0 +1,9 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_provenance_fixture(provenance): + assert provenance + provenance.insert_all() # should be a noop