diff --git a/swh/provenance/__init__.py b/swh/provenance/__init__.py --- a/swh/provenance/__init__.py +++ b/swh/provenance/__init__.py @@ -73,18 +73,11 @@ if cls == "local": from swh.core.db import BaseDb - from .postgresql.provenancedb_base import ProvenanceDBBase + from .postgresql.provenancedb import ProvenanceDB conn = BaseDb.connect(**kwargs["db"]).conn raise_on_commit = kwargs.get("raise_on_commit", False) - if "with-path" in ProvenanceDBBase(conn, raise_on_commit).flavor: - from .postgresql.provenancedb_with_path import ProvenanceWithPathDB - - return ProvenanceWithPathDB(conn, raise_on_commit) - else: - from .postgresql.provenancedb_without_path import ProvenanceWithoutPathDB - - return ProvenanceWithoutPathDB(conn, raise_on_commit) + return ProvenanceDB(conn, raise_on_commit) elif cls == "remote": from .api.client import RemoteProvenanceStorage diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb.py rename from swh/provenance/postgresql/provenancedb_base.py rename to swh/provenance/postgresql/provenancedb.py --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb.py @@ -24,7 +24,7 @@ ) -class ProvenanceDBBase: +class ProvenanceDB: def __init__( self, conn: psycopg2.extensions.connection, raise_on_commit: bool = False ): @@ -55,13 +55,11 @@ def denormalized(self) -> bool: return "denormalized" in self.flavor - def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - ... - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[ProvenanceResult, None, None]: - ... + def _relation_uses_location_table(self, relation: RelationType) -> bool: + if self.with_path(): + src = relation.value.split("_")[0] + return src in ("content", "directory") + return False def content_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: return self._entity_set_date("content", dates) @@ -119,6 +117,19 @@ def revision_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: return self._entity_set_date("revision", dates) + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: + sql = "select * from swh_provenance_content_find_first(%s)" + self.cursor.execute(sql, (id,)) + row = self.cursor.fetchone() + return ProvenanceResult(**row) if row is not None else None + + def content_find_all( + self, id: Sha1Git, limit: Optional[int] = None + ) -> Generator[ProvenanceResult, None, None]: + sql = "select * from swh_provenance_content_find_all(%s, %s)" + self.cursor.execute(sql, (id, limit)) + yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) + def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool: try: if origins: @@ -352,6 +363,3 @@ self.cursor.execute(sql, sha1s) result.update(RelationData(**row) for row in self.cursor.fetchall()) return result - - def _relation_uses_location_table(self, relation: RelationType) -> bool: - ... diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py deleted file mode 100644 --- a/swh/provenance/postgresql/provenancedb_with_path.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (C) 2021 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from typing import Generator, Optional - -from swh.model.model import Sha1Git - -from ..interface import ProvenanceResult, RelationType -from .provenancedb_base import ProvenanceDBBase - - -class ProvenanceWithPathDB(ProvenanceDBBase): - def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - if self.denormalized: - sql = """ - select C_L.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url AS origin, - L.path as path - from ( - select C.sha1 as sha1, - unnest(revision) as revision, - unnest(location) as location - from content_in_revision as C_R - inner join content as C on (C.id=C_R.content) - where C.sha1=%s) as C_L - inner join revision as R on (R.id=C_L.revision) - inner join location as L on (L.id=C_L.location) - LEFT JOIN origin as O ON (R.origin=O.id) - order by date, revision, origin, path asc limit 1 - """ - else: - sql = """ - SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN location as L ON (CR.location=L.id) - INNER JOIN revision as R ON (CR.revision=R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - WHERE C.sha1=%s - ORDER BY date, revision, origin, path ASC LIMIT 1 - """ - self.cursor.execute(sql, (id,)) - row = self.cursor.fetchone() - return ProvenanceResult(**row) if row is not None else None - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[ProvenanceResult, None, None]: - early_cut = f"LIMIT {limit}" if limit is not None else "" - if self.denormalized: - sql = f""" - (SELECT C_L.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM ( - SELECT C.sha1 AS sha1, - unnest(revision) AS revision, - unnest(location) AS location - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id = C_R.content) - WHERE C.sha1=%s) AS C_L - INNER JOIN revision AS R ON (R.id = C_L.revision) - INNER JOIN location AS L ON (L.id = C_L.location) - LEFT JOIN origin as O ON (R.origin=O.id) - ) - UNION - (WITH - C_D as ( - SELECT C.sha1 AS content_sha1, - unnest(CD.directory) AS directory, - unnest(CD.location) AS location - FROM content AS C - INNER JOIN content_in_directory AS CD ON (CD.content = C.id) - WHERE C.sha1=%s - ), - D_R as ( - SELECT C_D.content_sha1 AS content_sha1, - DL.path AS file_path, - unnest(DR.revision) AS revision, - unnest(DR.location) AS prefix_location - FROM C_D - INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) - INNER JOIN location AS DL ON (DL.id = C_D.location) - ) - SELECT D_R.content_sha1 AS sha1, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - CASE DL.path - WHEN '' THEN D_R.file_path - WHEN '.' THEN D_R.file_path - ELSE (DL.path || '/' || D_R.file_path)::unix_path - END AS path - FROM D_R - INNER JOIN location AS DL ON (D_R.prefix_location = DL.id) - INNER JOIN revision AS R ON (D_R.revision = R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - ) - ORDER BY date, revision, origin, path {early_cut} - """ - else: - sql = f""" - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN location AS L ON (CR.location=L.id) - INNER JOIN revision AS R ON (CR.revision=R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - WHERE C.sha1=%s) - UNION - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - CASE DL.path - WHEN '' THEN CL.path - WHEN '.' THEN CL.path - ELSE (DL.path || '/' || CL.path)::unix_path - END AS path - FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id=CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) - INNER JOIN revision AS R ON (DR.revision=R.id) - INNER JOIN location AS CL ON (CD.location=CL.id) - INNER JOIN location AS DL ON (DR.location=DL.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s) - ORDER BY date, revision, origin, path {early_cut} - """ - self.cursor.execute(sql, (id, id)) - yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) - - def _relation_uses_location_table(self, relation: RelationType) -> bool: - src, *_ = relation.value.split("_") - return src in ("content", "directory") diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py deleted file mode 100644 --- a/swh/provenance/postgresql/provenancedb_without_path.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (C) 2021 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from typing import Generator, Optional - -from swh.model.model import Sha1Git - -from ..provenance import ProvenanceResult, RelationType -from .provenancedb_base import ProvenanceDBBase - - -class ProvenanceWithoutPathDB(ProvenanceDBBase): - def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - if self.denormalized: - sql = """ - select C_L.sha1 as content, - R.sha1 as revision, - R.date as date, - O.url AS origin, - '\\x'::bytea as path - FROM ( - SELECT C.sha1, unnest(revision) AS revision - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id=C_R.content) - WHERE C.sha1=%s - ) AS C_L - INNER JOIN revision as R on (R.id=C_L.revision) - LEFT JOIN origin as O ON (R.origin=O.id) - ORDER BY date, revision, origin ASC LIMIT 1 - """ - else: - sql = """ - SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea AS path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN revision AS R ON (CR.revision = R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - WHERE C.sha1=%s - ORDER BY date, revision, origin ASC LIMIT 1 - """ - - self.cursor.execute(sql, (id,)) - row = self.cursor.fetchone() - return ProvenanceResult(**row) if row is not None else None - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[ProvenanceResult, None, None]: - early_cut = f"LIMIT {limit}" if limit is not None else "" - if self.denormalized: - sql = f""" - (SELECT C_L.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea as path - FROM ( - SELECT C.sha1, unnest(revision) AS revision - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id=C_R.content) - WHERE C.sha1=%s) AS C_L - INNER JOIN revision AS R ON (R.id=C_L.revision) - LEFT JOIN origin as O ON (R.origin=O.id) - ) - UNION - (WITH - C_D as ( - SELECT C.sha1 AS content_sha1, - unnest(CD.directory) AS directory - FROM content AS C - INNER JOIN content_in_directory AS CD ON (CD.content = C.id) - WHERE C.sha1=%s - ), - D_R as ( - SELECT C_D.content_sha1 AS content_sha1, - unnest(DR.revision) AS revision - FROM C_D - INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) - ) - SELECT D_R.content_sha1 as content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea AS path - FROM D_R - INNER JOIN revision AS R ON (D_R.revision = R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - ) - ORDER BY date, revision, path {early_cut} - """ - else: - sql = f""" - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea as path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN revision AS R ON (CR.revision=R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - WHERE C.sha1=%s) - UNION - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea as path - FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id=CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) - INNER JOIN revision AS R ON (DR.revision=R.id) - LEFT JOIN origin as O ON (R.origin=O.id) - WHERE C.sha1=%s) - ORDER BY date, revision, origin {early_cut} - """ - self.cursor.execute(sql, (id, id)) - yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) - - def _relation_uses_location_table(self, relation: RelationType) -> bool: - return False diff --git a/swh/provenance/sql/40-funcs.sql b/swh/provenance/sql/40-funcs.sql new file mode 100644 --- /dev/null +++ b/swh/provenance/sql/40-funcs.sql @@ -0,0 +1,338 @@ +select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset +select position('with-path' in swh_get_dbflavor()::text) != 0 as dbflavor_with_path \gset + +create type relation_row as (src sha1_git, dst sha1_git, loc unix_path); + +\if :dbflavor_norm + +\if :dbflavor_with_path +-- +-- with path and normalized +-- + +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + SELECT C.sha1 AS content, + R.sha1 AS revision, + R.date AS date, + O.url AS origin, + L.path AS path + FROM content AS C + INNER JOIN content_in_revision AS CR ON (CR.content = C.id) + INNER JOIN location as L ON (CR.location = L.id) + INNER JOIN revision as R ON (CR.revision = R.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + WHERE C.sha1=content_id + ORDER BY date, revision, origin, path ASC LIMIT 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (select C.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + l.path as path + from content as c + inner join content_in_revision as cr on (cr.content = c.id) + inner join location as l on (cr.location = l.id) + inner join revision as r on (cr.revision = r.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + where c.sha1=content_id) + union + (select c.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + case dirloc.path + when '' then cntloc.path + when '.' then cntloc.path + else (dirloc.path || '/' || cntloc.path)::unix_path + end as path + from content as c + inner join content_in_directory as cd on (c.id = cd.content) + inner join directory_in_revision as dr on (cd.directory = dr.directory) + inner join revision as r on (dr.revision = r.id) + inner join location as cntloc on (cd.location = cntloc.id) + inner join location as dirloc on (dr.location = dirloc.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + where C.sha1=content_id) + order by date, revision, origin, path limit early_cut +$$; + +\else +-- +-- without path and normalized +-- +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + SELECT C.sha1 AS sha1, + R.sha1 AS revision, + R.date AS date, + O.url as origin, + '\\x'::bytea AS path + FROM content AS C + INNER JOIN content_in_revision AS CR ON (CR.content = C.id) + INNER JOIN revision AS R ON (CR.revision = R.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + WHERE C.sha1=content_id + ORDER BY date, revision, origin ASC LIMIT 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (select C.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::bytea as path + from content as c + inner join content_in_revision as cr on (cr.content = c.id) + inner join revision as r on (cr.revision = r.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + where c.sha1=content_id) + union + (select c.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::bytea as path + from content as c + inner join content_in_directory as cd on (c.id = cd.content) + inner join directory_in_revision as dr on (cd.directory = dr.directory) + inner join revision as r on (dr.revision = r.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + where C.sha1=content_id) + order by date, revision, origin, path limit early_cut +$$; + +-- :dbflavor_with_path +\endif + +-- :dbflavor_norm +\else + +\if :dbflavor_with_path +-- +-- with path and denormalized +-- + +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + select C_L.sha1 as sha1, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from ( + select C.sha1 as sha1, + unnest(revision) as revision, + unnest(location) as location + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=content_id + ) as C_L + inner join revision as R on (R.id=C_L.revision) + inner join location as L on (L.id=C_L.location) + LEFT JOIN origin AS O ON (R.origin=O.id) + order by date, revision, origin, path asc limit 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (with cnt as ( + select c.sha1 as sha1, + unnest(c_r.revision) as revision, + unnest(c_r.location) as location + from content_in_revision as c_r + inner join content as c on (c.id = c_r.content) + where c.sha1 = content_id + ) + select cnt.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + l.path as path + from cnt + inner join revision as r on (r.id = cnt.revision) + inner join location as l on (l.id = cnt.location) + LEFT JOIN origin AS O ON (R.origin=O.id) + ) + union + (with cnt as ( + select c.sha1 as content_sha1, + unnest(cd.directory) as directory, + unnest(cd.location) as location + from content as c + inner join content_in_directory as cd on (cd.content = c.id) + where c.sha1 = content_id + ), + cntdir as ( + select cnt.content_sha1 as content_sha1, + cntloc.path as file_path, + unnest(dr.revision) as revision, + unnest(dr.location) as prefix_location + from cnt + inner join directory_in_revision as dr on (dr.directory = cnt.directory) + inner join location as cntloc on (cntloc.id = cnt.location) + ) + select cntdir.content_sha1 as sha1, + r.sha1 as revision, + r.date as date, + O.url as origin, + case dirloc.path + when '' then cntdir.file_path + when '.' then cntdir.file_path + else (dirloc.path || '/' || cntdir.file_path)::unix_path + end as path + from cntdir + inner join location as dirloc on (cntdir.prefix_location = dirloc.id) + inner join revision as r on (cntdir.revision = r.id) + LEFT JOIN origin AS O ON (R.origin=O.id) + ) + order by date, revision, origin, path limit early_cut +$$; + +\else +-- +-- without path and denormalized +-- +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + select C_L.sha1 as sha1, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\\x'::bytea as path + from ( + select C.sha1, unnest(revision) as revision + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=content_id + ) as C_L + inner join revision as R on (R.id=C_L.revision) + LEFT JOIN origin AS O ON (R.origin=O.id) + order by date, revision, origin, path asc limit 1 +$$; + + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (with cnt as ( + select c.sha1 as sha1, + unnest(c_r.revision) as revision + from content_in_revision as c_r + inner join content as c on (c.id = c_r.content) + where c.sha1 = content_id + ) + select cnt.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::bytea as path + from cnt + inner join revision as r on (r.id = cnt.revision) + LEFT JOIN origin AS O ON (r.origin=O.id) + ) + union + (with cnt as ( + select c.sha1 as content_sha1, + unnest(cd.directory) as directory + from content as c + inner join content_in_directory as cd on (cd.content = c.id) + where c.sha1 = content_id + ), + cntdir as ( + select cnt.content_sha1 as content_sha1, + unnest(dr.revision) as revision + from cnt + inner join directory_in_revision as dr on (dr.directory = cnt.directory) + ) + select cntdir.content_sha1 as sha1, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::bytea as path + from cntdir + inner join revision as r on (cntdir.revision = r.id) + LEFT JOIN origin AS O ON (r.origin=O.id) + ) + order by date, revision, origin, path limit early_cut +$$; + +\endif +-- :dbflavor_with_path +\endif +-- :dbflavor_norm diff --git a/swh/provenance/tests/test_provenance_db.py b/swh/provenance/tests/test_provenance_db.py --- a/swh/provenance/tests/test_provenance_db.py +++ b/swh/provenance/tests/test_provenance_db.py @@ -4,16 +4,13 @@ # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone -from typing import Type from swh.model.model import OriginVisitStatus from swh.model.tests.swh_model_data import TEST_OBJECTS -from swh.provenance.interface import ProvenanceInterface, ProvenanceStorageInterface +from swh.provenance.interface import ProvenanceInterface from swh.provenance.model import OriginEntry from swh.provenance.origin import origin_add -from swh.provenance.postgresql.provenancedb_base import ProvenanceDBBase -from swh.provenance.postgresql.provenancedb_with_path import ProvenanceWithPathDB -from swh.provenance.postgresql.provenancedb_without_path import ProvenanceWithoutPathDB +from swh.provenance.postgresql.provenancedb import ProvenanceDB from swh.provenance.storage.archive import ArchiveStorage from swh.storage.postgresql.storage import Storage @@ -41,16 +38,10 @@ def test_provenance_flavor(provenance: ProvenanceInterface) -> None: - if isinstance(provenance.storage, ProvenanceDBBase): + if isinstance(provenance.storage, ProvenanceDB): assert provenance.storage.flavor in ( "with-path", "without-path", "with-path-denormalized", "without-path-denormalized", ) - backend_class: Type[ProvenanceStorageInterface] - if "with-path" in provenance.storage.flavor: - backend_class = ProvenanceWithPathDB - else: - backend_class = ProvenanceWithoutPathDB - assert isinstance(provenance.storage, backend_class)