diff --git a/swh/provenance/postgresql/provenancedb_base.py b/swh/provenance/postgresql/provenancedb_base.py index 42d6490..2adbb20 100644 --- a/swh/provenance/postgresql/provenancedb_base.py +++ b/swh/provenance/postgresql/provenancedb_base.py @@ -1,357 +1,362 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime import itertools import logging from typing import Dict, Generator, Iterable, Optional, Set, Tuple import psycopg2 import psycopg2.extras from typing_extensions import Literal from swh.core.db import BaseDb from swh.model.model import Sha1Git from ..interface import ( EntityType, ProvenanceResult, RelationData, RelationType, RevisionData, ) class ProvenanceDBBase: def __init__( self, conn: psycopg2.extensions.connection, raise_on_commit: bool = False ): BaseDb.adapt_conn(conn) conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) conn.set_session(autocommit=True) self.conn = conn self.cursor = self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # XXX: not sure this is the best place to do it! sql = "SET timezone TO 'UTC'" self.cursor.execute(sql) self._flavor: Optional[str] = None self.raise_on_commit = raise_on_commit @property def flavor(self) -> str: if self._flavor is None: sql = "SELECT swh_get_dbflavor() AS flavor" self.cursor.execute(sql) self._flavor = self.cursor.fetchone()["flavor"] assert self._flavor is not None return self._flavor def with_path(self) -> bool: return "with-path" in self.flavor @property def denormalized(self) -> bool: return "denormalized" in self.flavor - def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - ... - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[ProvenanceResult, None, None]: - ... - def content_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: return self._entity_set_date("content", dates) def content_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: return self._entity_get_date("content", ids) def directory_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: return self._entity_set_date("directory", dates) def directory_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, datetime]: return self._entity_get_date("directory", ids) def entity_get_all(self, entity: EntityType) -> Set[Sha1Git]: sql = f"SELECT sha1 FROM {entity.value}" self.cursor.execute(sql) return {row["sha1"] for row in self.cursor.fetchall()} def location_get(self) -> Set[bytes]: sql = "SELECT encode(location.path::bytea, 'escape') AS path FROM location" self.cursor.execute(sql) return {row["path"] for row in self.cursor.fetchall()} def origin_set_url(self, urls: Dict[Sha1Git, str]) -> bool: try: if urls: sql = """ LOCK TABLE ONLY origin; INSERT INTO origin(sha1, url) VALUES %s ON CONFLICT DO NOTHING """ psycopg2.extras.execute_values(self.cursor, sql, urls.items()) return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message logging.exception("Unexpected error") if self.raise_on_commit: raise return False def origin_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, str]: urls: Dict[Sha1Git, str] = {} sha1s = tuple(ids) if sha1s: values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT sha1, url FROM origin WHERE sha1 IN ({values}) """ self.cursor.execute(sql, sha1s) urls.update((row["sha1"], row["url"]) for row in self.cursor.fetchall()) return urls def revision_set_date(self, dates: Dict[Sha1Git, datetime]) -> bool: return self._entity_set_date("revision", dates) + def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: + sql = "select * from swh_provenance_content_find_first(%s)" + self.cursor.execute(sql, (id,)) + row = self.cursor.fetchone() + return ProvenanceResult(**row) if row is not None else None + + def content_find_all( + self, id: Sha1Git, limit: Optional[int] = None + ) -> Generator[ProvenanceResult, None, None]: + sql = "select * from swh_provenance_content_find_all(%s, %s)" + self.cursor.execute(sql, (id, limit)) + yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) + def revision_set_origin(self, origins: Dict[Sha1Git, Sha1Git]) -> bool: try: if origins: sql = """ LOCK TABLE ONLY revision; INSERT INTO revision(sha1, origin) (SELECT V.rev AS sha1, O.id AS origin FROM (VALUES %s) AS V(rev, org) JOIN origin AS O ON (O.sha1=V.org)) ON CONFLICT (sha1) DO UPDATE SET origin=EXCLUDED.origin """ psycopg2.extras.execute_values(self.cursor, sql, origins.items()) return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message logging.exception("Unexpected error") if self.raise_on_commit: raise return False def revision_get(self, ids: Iterable[Sha1Git]) -> Dict[Sha1Git, RevisionData]: result: Dict[Sha1Git, RevisionData] = {} sha1s = tuple(ids) if sha1s: values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT sha1, date, origin FROM revision WHERE sha1 IN ({values}) """ self.cursor.execute(sql, sha1s) result.update( (row["sha1"], RevisionData(date=row["date"], origin=row["origin"])) for row in self.cursor.fetchall() ) return result def relation_add( self, relation: RelationType, data: Iterable[RelationData] ) -> bool: try: rows = tuple((rel.src, rel.dst, rel.path) for rel in data) if rows: table = relation.value src, *_, dst = table.split("_") if src != "origin": # Origin entries should be inserted previously as they require extra # non-null information srcs = tuple(set((sha1,) for (sha1, _, _) in rows)) sql = f""" LOCK TABLE ONLY {src}; INSERT INTO {src}(sha1) VALUES %s ON CONFLICT DO NOTHING """ psycopg2.extras.execute_values(self.cursor, sql, srcs) if dst != "origin": # Origin entries should be inserted previously as they require extra # non-null information dsts = tuple(set((sha1,) for (_, sha1, _) in rows)) sql = f""" LOCK TABLE ONLY {dst}; INSERT INTO {dst}(sha1) VALUES %s ON CONFLICT DO NOTHING """ psycopg2.extras.execute_values(self.cursor, sql, dsts) joins = [ f"INNER JOIN {src} AS S ON (S.sha1=V.src)", f"INNER JOIN {dst} AS D ON (D.sha1=V.dst)", ] nope = (RelationType.REV_BEFORE_REV, RelationType.REV_IN_ORG) selected = ["S.id"] if self.denormalized and relation not in nope: selected.append("ARRAY_AGG(D.id)") else: selected.append("D.id") if self._relation_uses_location_table(relation): locations = tuple(set((path,) for (_, _, path) in rows)) sql = """ LOCK TABLE ONLY location; INSERT INTO location(path) VALUES %s ON CONFLICT (path) DO NOTHING """ psycopg2.extras.execute_values(self.cursor, sql, locations) joins.append("INNER JOIN location AS L ON (L.path=V.path)") if self.denormalized: selected.append("ARRAY_AGG(L.id)") else: selected.append("L.id") sql_l = [ f"INSERT INTO {table}", f" SELECT {', '.join(selected)}", " FROM (VALUES %s) AS V(src, dst, path)", *joins, ] if self.denormalized and relation not in nope: sql_l.append("GROUP BY S.id") sql_l.append( f"""ON CONFLICT ({src}) DO UPDATE SET {dst}=ARRAY( SELECT UNNEST({table}.{dst} || excluded.{dst})), location=ARRAY( SELECT UNNEST({relation.value}.location || excluded.location)) """ ) else: sql_l.append("ON CONFLICT DO NOTHING") sql = "\n".join(sql_l) psycopg2.extras.execute_values(self.cursor, sql, rows) return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message logging.exception("Unexpected error") if self.raise_on_commit: raise return False def relation_get( self, relation: RelationType, ids: Iterable[Sha1Git], reverse: bool = False ) -> Set[RelationData]: return self._relation_get(relation, ids, reverse) def relation_get_all(self, relation: RelationType) -> Set[RelationData]: return self._relation_get(relation, None) def _entity_get_date( self, entity: Literal["content", "directory", "revision"], ids: Iterable[Sha1Git], ) -> Dict[Sha1Git, datetime]: dates: Dict[Sha1Git, datetime] = {} sha1s = tuple(ids) if sha1s: values = ", ".join(itertools.repeat("%s", len(sha1s))) sql = f""" SELECT sha1, date FROM {entity} WHERE sha1 IN ({values}) """ self.cursor.execute(sql, sha1s) dates.update((row["sha1"], row["date"]) for row in self.cursor.fetchall()) return dates def _entity_set_date( self, entity: Literal["content", "directory", "revision"], data: Dict[Sha1Git, datetime], ) -> bool: try: if data: sql = f""" LOCK TABLE ONLY {entity}; INSERT INTO {entity}(sha1, date) VALUES %s ON CONFLICT (sha1) DO UPDATE SET date=LEAST(EXCLUDED.date,{entity}.date) """ psycopg2.extras.execute_values(self.cursor, sql, data.items()) return True except: # noqa: E722 # Unexpected error occurred, rollback all changes and log message logging.exception("Unexpected error") if self.raise_on_commit: raise return False def _relation_get( self, relation: RelationType, ids: Optional[Iterable[Sha1Git]], reverse: bool = False, ) -> Set[RelationData]: result: Set[RelationData] = set() sha1s: Optional[Tuple[Tuple[Sha1Git, ...]]] if ids is not None: sha1s = (tuple(ids),) where = f"WHERE {'S' if not reverse else 'D'}.sha1 IN %s" else: sha1s = None where = "" aggreg_dst = self.denormalized and relation in ( RelationType.CNT_EARLY_IN_REV, RelationType.CNT_IN_DIR, RelationType.DIR_IN_REV, ) if sha1s is None or sha1s[0]: table = relation.value src, *_, dst = table.split("_") # TODO: improve this! if src == "revision" and dst == "revision": src_field = "prev" dst_field = "next" else: src_field = src dst_field = dst if aggreg_dst: revloc = f"UNNEST(R.{dst_field}) AS dst" if self._relation_uses_location_table(relation): revloc += ", UNNEST(R.location) AS path" else: revloc = f"R.{dst_field} AS dst" if self._relation_uses_location_table(relation): revloc += ", R.location AS path" inner_sql = f""" SELECT S.sha1 AS src, {revloc} FROM {table} AS R INNER JOIN {src} AS S ON (S.id=R.{src_field}) {where} """ if self._relation_uses_location_table(relation): loc = "L.path AS path" else: loc = "NULL AS path" sql = f""" SELECT CL.src, D.sha1 AS dst, {loc} FROM ({inner_sql}) AS CL INNER JOIN {dst} AS D ON (D.id=CL.dst) """ if self._relation_uses_location_table(relation): sql += "INNER JOIN location AS L ON (L.id=CL.path)" self.cursor.execute(sql, sha1s) result.update(RelationData(**row) for row in self.cursor.fetchall()) return result def _relation_uses_location_table(self, relation: RelationType) -> bool: ... diff --git a/swh/provenance/postgresql/provenancedb_with_path.py b/swh/provenance/postgresql/provenancedb_with_path.py index 6ec25a1..e34f382 100644 --- a/swh/provenance/postgresql/provenancedb_with_path.py +++ b/swh/provenance/postgresql/provenancedb_with_path.py @@ -1,149 +1,13 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Generator, Optional - -from swh.model.model import Sha1Git - -from ..interface import ProvenanceResult, RelationType +from ..interface import RelationType from .provenancedb_base import ProvenanceDBBase class ProvenanceWithPathDB(ProvenanceDBBase): - def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - if self.denormalized: - sql = """ - SELECT C_L.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM ( - sELECT C.sha1 AS sha1, - UNNEST(revision) AS revision, - UNNEST(location) AS location - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id=C_R.content) - WHERE C.sha1=%s) AS C_L - INNER JOIN revision AS R ON (R.id=C_L.revision) - INNER JOIN location AS L ON (L.id=C_L.location) - LEFT JOIN origin AS O ON (R.origin=O.id) - ORDER BY date, revision, origin, path ASC LIMIT 1 - """ - else: - sql = """ - SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN location as L ON (CR.location=L.id) - INNER JOIN revision as R ON (CR.revision=R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s - ORDER BY date, revision, origin, path ASC LIMIT 1 - """ - self.cursor.execute(sql, (id,)) - row = self.cursor.fetchone() - return ProvenanceResult(**row) if row is not None else None - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[ProvenanceResult, None, None]: - early_cut = f"LIMIT {limit}" if limit is not None else "" - if self.denormalized: - sql = f""" - (SELECT C_L.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM ( - SELECT C.sha1 AS sha1, - unnest(revision) AS revision, - unnest(location) AS location - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id = C_R.content) - WHERE C.sha1=%s) AS C_L - INNER JOIN revision AS R ON (R.id = C_L.revision) - INNER JOIN location AS L ON (L.id = C_L.location) - LEFT JOIN origin AS O ON (R.origin=O.id) - ) - UNION - (WITH - C_D as ( - SELECT C.sha1 AS content_sha1, - unnest(CD.directory) AS directory, - unnest(CD.location) AS location - FROM content AS C - INNER JOIN content_in_directory AS CD ON (CD.content = C.id) - WHERE C.sha1=%s - ), - D_R as ( - SELECT C_D.content_sha1 AS content_sha1, - DL.path AS file_path, - unnest(DR.revision) AS revision, - unnest(DR.location) AS prefix_location - FROM C_D - INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) - INNER JOIN location AS DL ON (DL.id = C_D.location) - ) - SELECT D_R.content_sha1 AS sha1, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - CASE DL.path - WHEN '' THEN D_R.file_path - WHEN '.' THEN D_R.file_path - ELSE (DL.path || '/' || D_R.file_path)::unix_path - END AS path - FROM D_R - INNER JOIN location AS DL ON (D_R.prefix_location = DL.id) - INNER JOIN revision AS R ON (D_R.revision = R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - ) - ORDER BY date, revision, origin, path {early_cut} - """ - else: - sql = f""" - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - L.path AS path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN location AS L ON (CR.location=L.id) - INNER JOIN revision AS R ON (CR.revision=R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s) - UNION - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - CASE DL.path - WHEN '' THEN CL.path - WHEN '.' THEN CL.path - ELSE (DL.path || '/' || CL.path)::unix_path - END AS path - FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id=CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) - INNER JOIN revision AS R ON (DR.revision=R.id) - INNER JOIN location AS CL ON (CD.location=CL.id) - INNER JOIN location AS DL ON (DR.location=DL.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s) - ORDER BY date, revision, origin, path {early_cut} - """ - self.cursor.execute(sql, (id, id)) - yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) - def _relation_uses_location_table(self, relation: RelationType) -> bool: src, *_ = relation.value.split("_") return src in ("content", "directory") diff --git a/swh/provenance/postgresql/provenancedb_without_path.py b/swh/provenance/postgresql/provenancedb_without_path.py index 3d5f323..7713d2b 100644 --- a/swh/provenance/postgresql/provenancedb_without_path.py +++ b/swh/provenance/postgresql/provenancedb_without_path.py @@ -1,127 +1,12 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Generator, Optional - -from swh.model.model import Sha1Git - -from ..provenance import ProvenanceResult, RelationType +from ..provenance import RelationType from .provenancedb_base import ProvenanceDBBase class ProvenanceWithoutPathDB(ProvenanceDBBase): - def content_find_first(self, id: Sha1Git) -> Optional[ProvenanceResult]: - if self.denormalized: - sql = """ - SELECT C_L.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea AS path - FROM ( - SELECT C.sha1, UNNEST(revision) AS revision - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id=C_R.content) - WHERE C.sha1=%s - ) AS C_L - INNER JOIN revision AS R ON (R.id=C_L.revision) - LEFT JOIN origin AS O ON (R.origin=O.id) - ORDER BY date, revision, origin ASC LIMIT 1 - """ - else: - sql = """ - SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea AS path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content = C.id) - INNER JOIN revision AS R ON (CR.revision = R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s - ORDER BY date, revision, origin ASC LIMIT 1 - """ - - self.cursor.execute(sql, (id,)) - row = self.cursor.fetchone() - return ProvenanceResult(**row) if row is not None else None - - def content_find_all( - self, id: Sha1Git, limit: Optional[int] = None - ) -> Generator[ProvenanceResult, None, None]: - early_cut = f"LIMIT {limit}" if limit is not None else "" - if self.denormalized: - sql = f""" - (SELECT C_L.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea as path - FROM ( - SELECT C.sha1, UNNEST(revision) AS revision - FROM content_in_revision AS C_R - INNER JOIN content AS C ON (C.id=C_R.content) - WHERE C.sha1=%s) AS C_L - INNER JOIN revision AS R ON (R.id=C_L.revision) - LEFT JOIN origin AS O ON (R.origin=O.id) - ) - UNION - (WITH - C_D AS ( - SELECT C.sha1 AS content_sha1, - unnest(CD.directory) AS directory - FROM content AS C - INNER JOIN content_in_directory AS CD ON (CD.content = C.id) - WHERE C.sha1=%s - ), - D_R AS ( - SELECT C_D.content_sha1 AS content_sha1, - UNNEST(DR.revision) AS revision - FROM C_D - INNER JOIN directory_in_revision AS DR ON (DR.directory = C_D.directory) - ) - SELECT D_R.content_sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea AS path - FROM D_R - INNER JOIN revision AS R ON (D_R.revision = R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - ) - ORDER BY date, revision, path {early_cut} - """ - else: - sql = f""" - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea as path - FROM content AS C - INNER JOIN content_in_revision AS CR ON (CR.content=C.id) - INNER JOIN revision AS R ON (CR.revision=R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s) - UNION - (SELECT C.sha1 AS content, - R.sha1 AS revision, - R.date AS date, - O.url AS origin, - '\\x'::bytea AS path - FROM content AS C - INNER JOIN content_in_directory AS CD ON (C.id=CD.content) - INNER JOIN directory_in_revision AS DR ON (CD.directory=DR.directory) - INNER JOIN revision AS R ON (DR.revision=R.id) - LEFT JOIN origin AS O ON (R.origin=O.id) - WHERE C.sha1=%s) - ORDER BY date, revision, origin {early_cut} - """ - self.cursor.execute(sql, (id, id)) - yield from (ProvenanceResult(**row) for row in self.cursor.fetchall()) - def _relation_uses_location_table(self, relation: RelationType) -> bool: return False diff --git a/swh/provenance/sql/40-funcs.sql b/swh/provenance/sql/40-funcs.sql new file mode 100644 index 0000000..8e1636a --- /dev/null +++ b/swh/provenance/sql/40-funcs.sql @@ -0,0 +1,338 @@ +select position('denormalized' in swh_get_dbflavor()::text) = 0 as dbflavor_norm \gset +select position('with-path' in swh_get_dbflavor()::text) != 0 as dbflavor_with_path \gset + +create type relation_row as (src sha1_git, dst sha1_git, loc unix_path); + +\if :dbflavor_norm + +\if :dbflavor_with_path +-- +-- with path and normalized +-- + +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from content as C + inner join content_in_revision as CR on (CR.content = C.id) + inner join location as L on (CR.location = L.id) + inner join revision as R on (CR.revision = R.id) + left join origin as O on (R.origin=O.id) + where C.sha1=content_id + order by date, revision, origin, path asc limit 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (select C.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + l.path as path + from content as c + inner join content_in_revision as cr on (cr.content = c.id) + inner join location as l on (cr.location = l.id) + inner join revision as r on (cr.revision = r.id) + left join origin AS O on (R.origin=O.id) + where c.sha1=content_id) + union + (select c.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + case dirloc.path + when '' then cntloc.path + when '.' then cntloc.path + else (dirloc.path || '/' || cntloc.path)::unix_path + end as path + from content as c + inner join content_in_directory as cd on (c.id = cd.content) + inner join directory_in_revision as dr on (cd.directory = dr.directory) + inner join revision as r on (dr.revision = r.id) + inner join location as cntloc on (cd.location = cntloc.id) + inner join location as dirloc on (dr.location = dirloc.id) + left join origin as O on (R.origin=O.id) + where C.sha1=content_id) + order by date, revision, origin, path limit early_cut +$$; + +\else +-- +-- without path and normalized +-- +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + select C.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from content as C + inner join content_in_revision as CR on (CR.content = C.id) + inner join revision as R on (CR.revision = R.id) + left join origin as O on (R.origin=O.id) + where C.sha1=content_id + order by date, revision, origin asc limit 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (select C.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::unix_path as path + from content as c + inner join content_in_revision as cr on (cr.content = c.id) + inner join revision as r on (cr.revision = r.id) + left join origin as O on (R.origin=O.id) + where c.sha1=content_id) + union + (select c.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::unix_path as path + from content as c + inner join content_in_directory as cd on (c.id = cd.content) + inner join directory_in_revision as dr on (cd.directory = dr.directory) + inner join revision as r on (dr.revision = r.id) + left join origin as O on (R.origin=O.id) + where C.sha1=content_id) + order by date, revision, origin, path limit early_cut +$$; + +-- :dbflavor_with_path +\endif + +-- :dbflavor_norm +\else + +\if :dbflavor_with_path +-- +-- with path and denormalized +-- + +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + select C_L.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + L.path as path + from ( + select C.sha1 as sha1, + unnest(revision) as revision, + unnest(location) as location + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=content_id + ) as C_L + inner join revision as R on (R.id=C_L.revision) + inner join location as L on (L.id=C_L.location) + left join origin as O on (R.origin=O.id) + order by date, revision, origin, path asc limit 1 +$$; + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (with cnt as ( + select c.sha1 as sha1, + unnest(c_r.revision) as revision, + unnest(c_r.location) as location + from content_in_revision as c_r + inner join content as c on (c.id = c_r.content) + where c.sha1 = content_id + ) + select cnt.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + l.path as path + from cnt + inner join revision as r on (r.id = cnt.revision) + inner join location as l on (l.id = cnt.location) + left join origin as O on (R.origin=O.id) + ) + union + (with cnt as ( + select c.sha1 as content_sha1, + unnest(cd.directory) as directory, + unnest(cd.location) as location + from content as c + inner join content_in_directory as cd on (cd.content = c.id) + where c.sha1 = content_id + ), + cntdir as ( + select cnt.content_sha1 as content_sha1, + cntloc.path as file_path, + unnest(dr.revision) as revision, + unnest(dr.location) as prefix_location + from cnt + inner join directory_in_revision as dr on (dr.directory = cnt.directory) + inner join location as cntloc on (cntloc.id = cnt.location) + ) + select cntdir.content_sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + case dirloc.path + when '' then cntdir.file_path + when '.' then cntdir.file_path + else (dirloc.path || '/' || cntdir.file_path)::unix_path + end as path + from cntdir + inner join location as dirloc on (cntdir.prefix_location = dirloc.id) + inner join revision as r on (cntdir.revision = r.id) + left join origin as O on (R.origin=O.id) + ) + order by date, revision, origin, path limit early_cut +$$; + +\else +-- +-- without path and denormalized +-- +create or replace function swh_provenance_content_find_first(content_id sha1_git) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + select C_L.sha1 as content, + R.sha1 as revision, + R.date as date, + O.url as origin, + '\x'::unix_path as path + from ( + select C.sha1, unnest(revision) as revision + from content_in_revision as C_R + inner join content as C on (C.id=C_R.content) + where C.sha1=content_id + ) as C_L + inner join revision as R on (R.id=C_L.revision) + left join origin as O on (R.origin=O.id) + order by date, revision, origin, path asc limit 1 +$$; + + +create or replace function swh_provenance_content_find_all(content_id sha1_git, early_cut int) + returns table ( + content sha1_git, + revision sha1_git, + date timestamptz, + origin unix_path, + path unix_path + ) + language sql + stable +as $$ + (with cnt as ( + select c.sha1 as sha1, + unnest(c_r.revision) as revision + from content_in_revision as c_r + inner join content as c on (c.id = c_r.content) + where c.sha1 = content_id + ) + select cnt.sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::unix_path as path + from cnt + inner join revision as r on (r.id = cnt.revision) + left join origin as O on (r.origin=O.id) + ) + union + (with cnt as ( + select c.sha1 as content_sha1, + unnest(cd.directory) as directory + from content as c + inner join content_in_directory as cd on (cd.content = c.id) + where c.sha1 = content_id + ), + cntdir as ( + select cnt.content_sha1 as content_sha1, + unnest(dr.revision) as revision + from cnt + inner join directory_in_revision as dr on (dr.directory = cnt.directory) + ) + select cntdir.content_sha1 as content, + r.sha1 as revision, + r.date as date, + O.url as origin, + '\x'::unix_path as path + from cntdir + inner join revision as r on (cntdir.revision = r.id) + left join origin as O on (r.origin=O.id) + ) + order by date, revision, origin, path limit early_cut +$$; + +\endif +-- :dbflavor_with_path +\endif +-- :dbflavor_norm