Page MenuHomeSoftware Heritage

D5072.id18156.diff
No OneTemporary

D5072.id18156.diff

diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -3,3 +3,34 @@
Provenance DB module to query the provenance of source code artifacts present
in the Software Heritage archive.
+
+This project allows to build such a provenance db from the Software Heritage
+Archive, and query this database.
+
+## Building a provenance database
+
+Building the provenance database requires a read access to the Software
+Heritage archive, either via a direct access to the database (preferred for
+better performances), or using the RPC API to a Software Heritage Storage
+instance.
+
+It also need a postgresql database in which the provenance db will be written
+into.
+
+A configuration file is needed with with the access to both these databases:
+
+```
+archive:
+ cls: api
+ storage:
+ cls: remote
+ url: http://uffizi.internal.softwareheritage.org:5002
+
+provenance:
+ cls: ps
+ db:
+ dbname: provenance
+ host: localhost
+
+
+```
diff --git a/conftest.py b/conftest.py
new file mode 100644
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1 @@
+pytest_plugins = ["swh.core.db.pytest_plugin", "swh.storage.pytest_plugin"]
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core
+swh.core >= 0.12
swh.model
-swh.storage
\ No newline at end of file
+swh.storage
diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py
--- a/swh/provenance/__init__.py
+++ b/swh/provenance/__init__.py
@@ -10,7 +10,7 @@
def get_archive(cls: str, **kwargs) -> ArchiveInterface:
if cls == "api":
return ArchiveStorage(**kwargs["storage"])
- elif cls == "ps":
+ elif cls == "direct":
conn = connect(kwargs["db"])
return ArchivePostgreSQL(conn)
else:
@@ -18,11 +18,11 @@
def get_provenance(cls: str, **kwargs) -> ProvenanceInterface:
- if cls == "ps":
+ if cls == "local":
conn = connect(kwargs["db"])
- return ProvenancePostgreSQL(conn)
- elif cls == "ps_np":
- conn = connect(kwargs["db"])
- return ProvenancePostgreSQLNoPath(conn)
+ if kwargs.get("with_path", False):
+ return ProvenancePostgreSQL(conn)
+ else:
+ return ProvenancePostgreSQLNoPath(conn)
else:
raise NotImplementedError
diff --git a/swh/provenance/cli.py b/swh/provenance/cli.py
--- a/swh/provenance/cli.py
+++ b/swh/provenance/cli.py
@@ -105,27 +105,17 @@
atexit.register(exit)
-@cli.command(name="create")
-@click.option("--name", default=None)
+@cli.command(name="create", deprecated=True)
+@click.option("--maintenance-db", default=None)
+@click.option("--drop/--no-drop", "drop_db", default=False)
@click.pass_context
-def create(ctx, name):
- """Create new provenance database."""
- from .postgresql.db_utils import connect
-
- # Connect to server without selecting a database
- conninfo = ctx.obj["config"]["provenance"]["db"]
- conn = connect(conninfo)
-
- if ctx.obj["config"]["provenance"]["cls"] == "ps":
- from .postgresql.provenance import create_database
-
- create_database(conn, conninfo, name)
- elif ctx.obj["config"]["provenance"]["cls"] == "ps_np":
- from .postgresql_nopath.provenance import create_database
-
- create_database(conn, conninfo, name)
- else:
- raise NotImplementedError
+def create(ctx, maintenance_db, drop_db):
+ """Deprecated, please use:
+ swh db create provenance
+ and
+ swh db init provenance
+ instead.
+ """
@cli.command(name="iter-revisions")
diff --git a/swh/provenance/postgresql/provenance.py b/swh/provenance/postgresql/provenance.py
--- a/swh/provenance/postgresql/provenance.py
+++ b/swh/provenance/postgresql/provenance.py
@@ -18,27 +18,6 @@
def normalize(path: bytes) -> bytes:
return path[2:] if path.startswith(bytes("." + os.path.sep, "utf-8")) else path
-
-def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str):
- conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
-
- # Normalize dbname to avoid issues when reconnecting below
- name = name.casefold()
-
- # Create new database dropping previous one if exists
- cursor = conn.cursor()
- cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
- cursor.execute(f"""CREATE DATABASE {name}""")
- conn.close()
-
- # Reconnect to server selecting newly created database to add tables
- conninfo["dbname"] = name
- conn = connect(conninfo)
-
- sqldir = os.path.dirname(os.path.realpath(__file__))
- execute_sql(conn, os.path.join(sqldir, "provenance.sql"))
-
-
########################################################################################
########################################################################################
########################################################################################
diff --git a/swh/provenance/postgresql/provenance.sql b/swh/provenance/postgresql/provenance.sql
deleted file mode 100644
--- a/swh/provenance/postgresql/provenance.sql
+++ /dev/null
@@ -1,150 +0,0 @@
--- a Git object ID, i.e., a Git-style salted SHA1 checksum
-drop domain if exists sha1_git cascade;
-create domain sha1_git as bytea check (length(value) = 20);
-
--- UNIX path (absolute, relative, individual path component, etc.)
-drop domain if exists unix_path cascade;
-create domain unix_path as bytea;
-
-
-drop table if exists content;
-create table content
-(
- id bigserial primary key, -- internal identifier of the content blob
- sha1 sha1_git unique not null, -- intrinsic identifier of the content blob
- date timestamptz not null -- timestamp of the revision where the blob appears early
-);
-
-comment on column content.id is 'Content internal identifier';
-comment on column content.sha1 is 'Content intrinsic identifier';
-comment on column content.date is 'Earliest timestamp for the content (first seen time)';
-
-
-drop table if exists content_early_in_rev;
-create table content_early_in_rev
-(
- blob bigint not null, -- internal identifier of the content blob
- rev bigint not null, -- internal identifier of the revision where the blob appears for the first time
- loc bigint not null, -- location of the content relative to the revision root directory
- primary key (blob, rev, loc)
- -- foreign key (blob) references content (id),
- -- foreign key (rev) references revision (id),
- -- foreign key (loc) references location (id)
-);
-
-comment on column content_early_in_rev.blob is 'Content internal identifier';
-comment on column content_early_in_rev.rev is 'Revision internal identifier';
-comment on column content_early_in_rev.loc is 'Location of content in revision';
-
-
-drop table if exists content_in_dir;
-create table content_in_dir
-(
- blob bigint not null, -- internal identifier of the content blob
- dir bigint not null, -- internal identifier of the directory containing the blob
- loc bigint not null, -- location of the content relative to its parent directory in the isochrone frontier
- primary key (blob, dir, loc)
- -- foreign key (blob) references content (id),
- -- foreign key (dir) references directory (id),
- -- foreign key (loc) references location (id)
-);
-
-comment on column content_in_dir.blob is 'Content internal identifier';
-comment on column content_in_dir.dir is 'Directory internal identifier';
-comment on column content_in_dir.loc is 'Location of content in directory';
-
-
-drop table if exists directory;
-create table directory
-(
- id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier
- sha1 sha1_git unique not null, -- intrinsic identifier of the directory
- date timestamptz not null -- max timestamp among those of the directory children's
-);
-
-comment on column directory.id is 'Directory internal identifier';
-comment on column directory.sha1 is 'Directory intrinsic identifier';
-comment on column directory.date is 'Latest timestamp for the content in the directory';
-
-
-drop table if exists directory_in_rev;
-create table directory_in_rev
-(
- dir bigint not null, -- internal identifier of the directory appearing in the revision
- rev bigint not null, -- internal identifier of the revision containing the directory
- loc bigint not null, -- location of the directory relative to the revision root directory
- primary key (dir, rev, loc)
- -- foreign key (dir) references directory (id),
- -- foreign key (rev) references revision (id),
- -- foreign key (loc) references location (id)
-);
-
-comment on column directory_in_rev.dir is 'Directory internal identifier';
-comment on column directory_in_rev.rev is 'Revision internal identifier';
-comment on column directory_in_rev.loc is 'Location of directory in revision';
-
-
-drop table if exists location;
-create table location
-(
- id bigserial primary key, -- internal identifier of the location
- path unix_path unique not null -- path to the location
-);
-
-comment on column location.id is 'Location internal identifier';
-comment on column location.path is 'Path to the location';
-
-
-drop table if exists origin;
-create table origin
-(
- id bigserial primary key, -- internal identifier of the origin
- url unix_path unique not null -- url of the origin
-);
-
-comment on column origin.id is 'Origin internal identifier';
-comment on column origin.url is 'URL of the origin';
-
-
-drop table if exists revision;
-create table revision
-(
- id bigserial primary key, -- internal identifier of the revision
- sha1 sha1_git unique not null, -- intrinsic identifier of the revision
- date timestamptz not null, -- timestamp of the revision
- org bigint -- id of the preferred origin
- -- foreign key (org) references origin (id)
-);
-
-comment on column revision.id is 'Revision internal identifier';
-comment on column revision.sha1 is 'Revision intrinsic identifier';
-comment on column revision.date is 'Revision timestamp';
-comment on column revision.org is 'Prefered origin for the revision';
-
-
-drop table if exists revision_before_rev;
-create table revision_before_rev
-(
- prev bigserial not null, -- internal identifier of the source revision
- next bigserial not null, -- internal identifier of the destination revision
- primary key (prev, next)
- -- foreign key (prev) references revision (id),
- -- foreign key (next) references revision (id)
-);
-
-comment on column revision_before_rev.prev is 'Source revision internal identifier';
-comment on column revision_before_rev.next is 'Destination revision internal identifier';
-
-
-drop table if exists revision_in_org;
-create table revision_in_org
-(
- rev bigint not null, -- internal identifier of the revision poined by the origin
- org bigint not null, -- internal identifier of the origin that points to the revision
- primary key (rev, org)
- -- foreign key (rev) references revision (id),
- -- foreign key (org) references origin (id)
-);
-
-comment on column revision_in_org.rev is 'Revision internal identifier';
-comment on column revision_in_org.org is 'Origin internal identifier';
diff --git a/swh/provenance/postgresql_nopath/provenance.py b/swh/provenance/postgresql_nopath/provenance.py
--- a/swh/provenance/postgresql_nopath/provenance.py
+++ b/swh/provenance/postgresql_nopath/provenance.py
@@ -15,26 +15,6 @@
from ..revision import RevisionEntry
-def create_database(conn: psycopg2.extensions.connection, conninfo: dict, name: str):
- conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
-
- # Normalize dbname to avoid issues when reconnecting below
- name = name.casefold()
-
- # Create new database dropping previous one if exists
- cursor = conn.cursor()
- cursor.execute(f"""DROP DATABASE IF EXISTS {name}""")
- cursor.execute(f"""CREATE DATABASE {name}""")
- conn.close()
-
- # Reconnect to server selecting newly created database to add tables
- conninfo["dbname"] = name
- conn = connect(conninfo)
-
- sqldir = os.path.dirname(os.path.realpath(__file__))
- execute_sql(conn, os.path.join(sqldir, "provenance.sql"))
-
-
########################################################################################
########################################################################################
########################################################################################
diff --git a/swh/provenance/postgresql_nopath/provenance.sql b/swh/provenance/sql/30-schema.sql
rename from swh/provenance/postgresql_nopath/provenance.sql
rename to swh/provenance/sql/30-schema.sql
--- a/swh/provenance/postgresql_nopath/provenance.sql
+++ b/swh/provenance/sql/30-schema.sql
@@ -1,92 +1,37 @@
-- a Git object ID, i.e., a Git-style salted SHA1 checksum
-drop domain if exists sha1_git cascade;
create domain sha1_git as bytea check (length(value) = 20);
-- UNIX path (absolute, relative, individual path component, etc.)
-drop domain if exists unix_path cascade;
create domain unix_path as bytea;
-
-drop table if exists content;
create table content
(
id bigserial primary key, -- internal identifier of the content blob
sha1 sha1_git unique not null, -- intrinsic identifier of the content blob
date timestamptz not null -- timestamp of the revision where the blob appears early
);
-
comment on column content.id is 'Content internal identifier';
comment on column content.sha1 is 'Content intrinsic identifier';
comment on column content.date is 'Earliest timestamp for the content (first seen time)';
-
-drop table if exists content_early_in_rev;
-create table content_early_in_rev
-(
- blob bigint not null, -- internal identifier of the content blob
- rev bigint not null, -- internal identifier of the revision where the blob appears for the first time
- primary key (blob, rev)
- -- foreign key (blob) references content (id),
- -- foreign key (rev) references revision (id)
-);
-
-comment on column content_early_in_rev.blob is 'Content internal identifier';
-comment on column content_early_in_rev.rev is 'Revision internal identifier';
-
-
-drop table if exists content_in_dir;
-create table content_in_dir
-(
- blob bigint not null, -- internal identifier of the content blob
- dir bigint not null, -- internal identifier of the directory containing the blob
- primary key (blob, dir)
- -- foreign key (blob) references content (id),
- -- foreign key (dir) references directory (id)
-);
-
-comment on column content_in_dir.blob is 'Content internal identifier';
-comment on column content_in_dir.dir is 'Directory internal identifier';
-
-
-drop table if exists directory;
create table directory
(
id bigserial primary key, -- internal identifier of the directory appearing in an isochrone inner frontier
sha1 sha1_git unique not null, -- intrinsic identifier of the directory
date timestamptz not null -- max timestamp among those of the directory children's
);
-
comment on column directory.id is 'Directory internal identifier';
comment on column directory.sha1 is 'Directory intrinsic identifier';
comment on column directory.date is 'Latest timestamp for the content in the directory';
-
-drop table if exists directory_in_rev;
-create table directory_in_rev
-(
- dir bigint not null, -- internal identifier of the directory appearing in the revision
- rev bigint not null, -- internal identifier of the revision containing the directory
- primary key (dir, rev)
- -- foreign key (dir) references directory (id),
- -- foreign key (rev) references revision (id)
-);
-
-comment on column directory_in_rev.dir is 'Directory internal identifier';
-comment on column directory_in_rev.rev is 'Revision internal identifier';
-
-
-drop table if exists origin;
create table origin
(
id bigserial primary key, -- internal identifier of the origin
url unix_path unique not null -- url of the origin
);
-
comment on column origin.id is 'Origin internal identifier';
comment on column origin.url is 'URL of the origin';
-
-drop table if exists revision;
create table revision
(
id bigserial primary key, -- internal identifier of the revision
@@ -95,14 +40,11 @@
org bigint -- id of the preferred origin
-- foreign key (org) references origin (id)
);
-
comment on column revision.id is 'Revision internal identifier';
comment on column revision.sha1 is 'Revision intrinsic identifier';
comment on column revision.date is 'Revision timestamp';
comment on column revision.org is 'Prefered origin for the revision';
-
-drop table if exists revision_before_rev;
create table revision_before_rev
(
prev bigserial not null, -- internal identifier of the source revision
@@ -111,12 +53,9 @@
-- foreign key (prev) references revision (id),
-- foreign key (next) references revision (id)
);
-
comment on column revision_before_rev.prev is 'Source revision internal identifier';
comment on column revision_before_rev.next is 'Destination revision internal identifier';
-
-drop table if exists revision_in_org;
create table revision_in_org
(
rev bigint not null, -- internal identifier of the revision poined by the origin
@@ -125,6 +64,5 @@
-- foreign key (rev) references revision (id),
-- foreign key (org) references origin (id)
);
-
comment on column revision_in_org.rev is 'Revision internal identifier';
comment on column revision_in_org.org is 'Origin internal identifier';
diff --git a/swh/provenance/sql/35-schema-with-path-flavor.sql b/swh/provenance/sql/35-schema-with-path-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/35-schema-with-path-flavor.sql
@@ -0,0 +1,46 @@
+create table content_early_in_rev
+(
+ blob bigint not null, -- internal identifier of the content blob
+ rev bigint not null, -- internal identifier of the revision where the blob appears for the first time
+ loc bigint not null -- location of the content relative to the revision root directory
+ -- foreign key (blob) references content (id),
+ -- foreign key (rev) references revision (id),
+ -- foreign key (loc) references location (id)
+);
+comment on column content_early_in_rev.blob is 'Content internal identifier';
+comment on column content_early_in_rev.rev is 'Revision internal identifier';
+comment on column content_early_in_rev.loc is 'Location of content in revision';
+
+create table content_in_dir
+(
+ blob bigint not null, -- internal identifier of the content blob
+ dir bigint not null, -- internal identifier of the directory containing the blob
+ loc bigint not null -- location of the content relative to its parent directory in the isochrone frontier
+ -- foreign key (blob) references content (id),
+ -- foreign key (dir) references directory (id),
+ -- foreign key (loc) references location (id)
+);
+comment on column content_in_dir.blob is 'Content internal identifier';
+comment on column content_in_dir.dir is 'Directory internal identifier';
+comment on column content_in_dir.loc is 'Location of content in directory';
+
+create table directory_in_rev
+(
+ dir bigint not null, -- internal identifier of the directory appearing in the revision
+ rev bigint not null, -- internal identifier of the revision containing the directory
+ loc bigint not null -- location of the directory relative to the revision root directory
+ -- foreign key (dir) references directory (id),
+ -- foreign key (rev) references revision (id),
+ -- foreign key (loc) references location (id)
+);
+comment on column directory_in_rev.dir is 'Directory internal identifier';
+comment on column directory_in_rev.rev is 'Revision internal identifier';
+comment on column directory_in_rev.loc is 'Location of directory in revision';
+
+create table location
+(
+ id bigserial primary key, -- internal identifier of the location
+ path unix_path unique not null -- path to the location
+);
+comment on column location.id is 'Location internal identifier';
+comment on column location.path is 'Path to the location';
diff --git a/swh/provenance/sql/35-schema-without-path-flavor.sql b/swh/provenance/sql/35-schema-without-path-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/35-schema-without-path-flavor.sql
@@ -0,0 +1,29 @@
+create table content_early_in_rev
+(
+ blob bigint not null, -- internal identifier of the content blob
+ rev bigint not null -- internal identifier of the revision where the blob appears for the first time
+ -- foreign key (blob) references content (id),
+ -- foreign key (rev) references revision (id)
+);
+comment on column content_early_in_rev.blob is 'Content internal identifier';
+comment on column content_early_in_rev.rev is 'Revision internal identifier';
+
+create table content_in_dir
+(
+ blob bigint not null, -- internal identifier of the content blob
+ dir bigint not null -- internal identifier of the directory containing the blob
+ -- foreign key (blob) references content (id),
+ -- foreign key (dir) references directory (id)
+);
+comment on column content_in_dir.blob is 'Content internal identifier';
+comment on column content_in_dir.dir is 'Directory internal identifier';
+
+create table directory_in_rev
+(
+ dir bigint not null, -- internal identifier of the directory appearing in the revision
+ rev bigint not null -- internal identifier of the revision containing the directory
+ -- foreign key (dir) references directory (id),
+ -- foreign key (rev) references revision (id)
+);
+comment on column directory_in_rev.dir is 'Directory internal identifier';
+comment on column directory_in_rev.rev is 'Revision internal identifier';
diff --git a/swh/provenance/sql/60-indexes-with-path-flavor.sql b/swh/provenance/sql/60-indexes-with-path-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/60-indexes-with-path-flavor.sql
@@ -0,0 +1,3 @@
+alter table content_early_in_rev add primary key (blob, rev, loc);
+alter table content_in_dir add primary key (blob, dir, loc);
+alter table directory_in_rev add primary key (dir, rev, loc);
diff --git a/swh/provenance/sql/60-indexes-without-path-flavor.sql b/swh/provenance/sql/60-indexes-without-path-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/provenance/sql/60-indexes-without-path-flavor.sql
@@ -0,0 +1,3 @@
+alter table content_early_in_rev add primary key (blob, rev);
+alter table content_in_dir add primary key (blob, dir);
+alter table directory_in_rev add primary key (dir, rev);
diff --git a/swh/provenance/tests/conftest.py b/swh/provenance/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/conftest.py
@@ -0,0 +1,28 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import glob
+from os import path
+
+import pytest
+
+from swh.core.db.pytest_plugin import postgresql_fact
+from swh.core.utils import numfile_sortkey as sortkey
+import swh.provenance
+
+
+SQL_DIR = path.join(path.dirname(swh.provenance.__file__), "sql")
+SQL_FILES = [sqlfile for sqlfile in sorted(glob.glob(path.join(SQL_DIR, "*.sql")), key=sortkey)
+ if '-without-path-' not in sqlfile]
+
+provenance_db = postgresql_fact(
+ "postgresql_proc", db_name="provenance", dump_files=SQL_FILES)
+
+
+@pytest.fixture
+def provenance(provenance_db):
+ """return a working and initialized provenance db"""
+ from swh.provenance.postgresql.provenance import ProvenancePostgreSQL as ProvenanceDB
+ return ProvenanceDB(provenance_db)
diff --git a/swh/provenance/tests/test_cli.py b/swh/provenance/tests/test_cli.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_cli.py
@@ -0,0 +1,39 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from click.testing import CliRunner
+import yaml
+
+from swh.core.cli import swh as swhmain
+import swh.provenance.cli # noqa ; ensure cli is loaded
+
+
+def test_cli_swh_db_help():
+ # swhmain.add_command(provenance_cli)
+ result = CliRunner().invoke(swhmain, ["provenance", "-h"])
+ assert result.exit_code == 0
+ assert "Commands:" in result.output
+ commands = result.output.split("Commands:")[1]
+ for command in (
+ "create",
+ "find-all",
+ "find-first",
+ "iter-origins",
+ "iter-revisions",
+ ):
+ assert f" {command} " in commands
+
+
+def test_cli_create_deprecated(provenance_db, tmp_path):
+ conffile = tmp_path / "config.yml"
+ conf = {
+ "provenance": {"cls": "local", "with_path": True,},
+ }
+ yaml.dump(conf, conffile.open("w"))
+ result = CliRunner().invoke(
+ swhmain, ["provenance", "--config-file", str(conffile), "create", "--drop"]
+ )
+ assert result.exit_code == 0, result.output
+ assert "DeprecationWarning" in result.output
diff --git a/swh/provenance/tests/test_conftest.py b/swh/provenance/tests/test_conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/provenance/tests/test_conftest.py
@@ -0,0 +1,9 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_provenance_fixture(provenance):
+ assert provenance
+ provenance.insert_all() # should be a noop

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 10:12 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227609

Event Timeline