diff --git a/sql/upgrades/145.sql b/sql/upgrades/145.sql
new file mode 100644
index 00000000..65686731
--- /dev/null
+++ b/sql/upgrades/145.sql
@@ -0,0 +1,9 @@
+-- SWH DB schema upgrade
+-- from_version: 144
+-- to_version: 145
+-- description: Improve query on origin_visit
+
+insert into dbversion(version, release, description)
+      values(145, now(), 'Work In Progress');
+
+create index concurrently on origin_visit(type, status, date);
diff --git a/swh/storage/db.py b/swh/storage/db.py
index 7441c2b9..086bded2 100644
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1,912 +1,917 @@
# Copyright (C) 2015-2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import select

from swh.core.db import BaseDb
from swh.core.db.db_utils import stored_procedure, jsonize
from swh.core.db.db_utils import execute_values_generator


class Db(BaseDb):
    """Proxy to the SWH DB, with wrappers around stored procedures

    """

    def mktemp_dir_entry(self, entry_type, cur=None):
        self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)',
                                  (('directory_entry_%s' % entry_type),))

    @stored_procedure('swh_mktemp_revision')
    def mktemp_revision(self, cur=None): pass

    @stored_procedure('swh_mktemp_release')
    def mktemp_release(self, cur=None): pass

    @stored_procedure('swh_mktemp_snapshot_branch')
    def mktemp_snapshot_branch(self, cur=None): pass

    def register_listener(self, notify_queue, cur=None):
        """Register a listener for NOTIFY queue `notify_queue`"""
        self._cursor(cur).execute("LISTEN %s" % notify_queue)

    def listen_notifies(self, timeout):
        """Listen to notifications for `timeout` seconds"""
        if select.select([self.conn], [], [], timeout) == ([], [], []):
            return
        else:
            self.conn.poll()
            while self.conn.notifies:
                yield self.conn.notifies.pop(0)

    @stored_procedure('swh_content_add')
    def content_add_from_temp(self, cur=None): pass

    @stored_procedure('swh_directory_add')
    def directory_add_from_temp(self, cur=None): pass

    @stored_procedure('swh_skipped_content_add')
    def skipped_content_add_from_temp(self, cur=None): pass

    @stored_procedure('swh_revision_add')
    def revision_add_from_temp(self, cur=None): pass

    @stored_procedure('swh_release_add')
    def release_add_from_temp(self, cur=None): pass

    def content_update_from_temp(self, keys_to_update, cur=None):
        cur = self._cursor(cur)
        cur.execute("""select swh_content_update(ARRAY[%s] :: text[])""" %
                    keys_to_update)

    content_get_metadata_keys = [
        'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'status']

    content_add_keys = content_get_metadata_keys + ['ctime']

    skipped_content_keys = [
        'sha1', 'sha1_git', 'sha256', 'blake2s256',
        'length', 'reason', 'status', 'origin']

    def content_get_metadata_from_sha1s(self, sha1s, cur=None):
        cur = self._cursor(cur)
        yield from execute_values_generator(
            cur, """
            select t.sha1, %s from (values %%s) as t (sha1)
            left join content using (sha1)
            """ % ', '.join(self.content_get_metadata_keys[1:]),
            ((sha1,) for sha1 in sha1s),
        )

    def content_get_range(self, start, end, limit=None, cur=None):
        """Retrieve contents within range [start, end].
""" cur = self._cursor(cur) query = """select %s from content where %%s <= sha1 and sha1 <= %%s order by sha1 limit %%s""" % ', '.join(self.content_get_metadata_keys) cur.execute(query, (start, end, limit)) yield from cur content_hash_keys = ['sha1', 'sha1_git', 'sha256', 'blake2s256'] def content_missing_from_list(self, contents, cur=None): cur = self._cursor(cur) keys = ', '.join(self.content_hash_keys) equality = ' AND '.join( ('t.%s = c.%s' % (key, key)) for key in self.content_hash_keys ) yield from execute_values_generator( cur, """ SELECT %s FROM (VALUES %%s) as t(%s) WHERE NOT EXISTS ( SELECT 1 FROM content c WHERE %s ) """ % (keys, keys, equality), (tuple(c[key] for key in self.content_hash_keys) for c in contents) ) def content_missing_per_sha1(self, sha1s, cur=None): cur = self._cursor(cur) yield from execute_values_generator(cur, """ SELECT t.sha1 FROM (VALUES %s) AS t(sha1) WHERE NOT EXISTS ( SELECT 1 FROM content c WHERE c.sha1 = t.sha1 )""", ((sha1,) for sha1 in sha1s)) def skipped_content_missing(self, contents, cur=None): if not contents: return [] cur = self._cursor(cur) query = """SELECT * FROM (VALUES %s) AS t (%s) WHERE not exists (SELECT 1 FROM skipped_content s WHERE s.sha1 is not distinct from t.sha1 and s.sha1_git is not distinct from t.sha1_git and s.sha256 is not distinct from t.sha256);""" % \ ((', '.join('%s' for _ in contents)), ', '.join(self.content_hash_keys)) cur.execute(query, [tuple(cont[key] for key in self.content_hash_keys) for cont in contents]) yield from cur def snapshot_exists(self, snapshot_id, cur=None): """Check whether a snapshot with the given id exists""" cur = self._cursor(cur) cur.execute("""SELECT 1 FROM snapshot where id=%s""", (snapshot_id,)) return bool(cur.fetchone()) def snapshot_add(self, snapshot_id, cur=None): """Add a snapshot from the temporary table""" cur = self._cursor(cur) cur.execute("""SELECT swh_snapshot_add(%s)""", (snapshot_id,)) snapshot_count_cols = ['target_type', 'count'] def snapshot_count_branches(self, snapshot_id, cur=None): cur = self._cursor(cur) query = """\ SELECT %s FROM swh_snapshot_count_branches(%%s) """ % ', '.join(self.snapshot_count_cols) cur.execute(query, (snapshot_id,)) yield from cur snapshot_get_cols = ['snapshot_id', 'name', 'target', 'target_type'] def snapshot_get_by_id(self, snapshot_id, branches_from=b'', branches_count=None, target_types=None, cur=None): cur = self._cursor(cur) query = """\ SELECT %s FROM swh_snapshot_get_by_id(%%s, %%s, %%s, %%s :: snapshot_target[]) """ % ', '.join(self.snapshot_get_cols) cur.execute(query, (snapshot_id, branches_from, branches_count, target_types)) yield from cur def snapshot_get_by_origin_visit(self, origin_url, visit_id, cur=None): cur = self._cursor(cur) query = """\ SELECT snapshot FROM origin_visit INNER JOIN origin ON origin.id = origin_visit.origin WHERE origin.url=%s AND origin_visit.visit=%s; """ cur.execute(query, (origin_url, visit_id)) ret = cur.fetchone() if ret: return ret[0] content_find_cols = ['sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'ctime', 'status'] def content_find(self, sha1=None, sha1_git=None, sha256=None, blake2s256=None, cur=None): """Find the content optionally on a combination of the following checksums sha1, sha1_git, sha256 or blake2s256. Args: sha1: sha1 content git_sha1: the sha1 computed `a la git` sha1 of the content sha256: sha256 content blake2s256: blake2s256 content Returns: The tuple (sha1, sha1_git, sha256, blake2s256) if found or None. 
""" cur = self._cursor(cur) checksum_dict = {'sha1': sha1, 'sha1_git': sha1_git, 'sha256': sha256, 'blake2s256': blake2s256} where_parts = [] args = [] # Adds only those keys which have value other than None for algorithm in checksum_dict: if checksum_dict[algorithm] is not None: args.append(checksum_dict[algorithm]) where_parts.append(algorithm + '= %s') query = ' AND '.join(where_parts) cur.execute("""SELECT %s FROM content WHERE %s """ % (','.join(self.content_find_cols), query), args) content = cur.fetchall() return content def directory_missing_from_list(self, directories, cur=None): cur = self._cursor(cur) yield from execute_values_generator( cur, """ SELECT id FROM (VALUES %s) as t(id) WHERE NOT EXISTS ( SELECT 1 FROM directory d WHERE d.id = t.id ) """, ((id,) for id in directories)) directory_ls_cols = ['dir_id', 'type', 'target', 'name', 'perms', 'status', 'sha1', 'sha1_git', 'sha256', 'length'] def directory_walk_one(self, directory, cur=None): cur = self._cursor(cur) cols = ', '.join(self.directory_ls_cols) query = 'SELECT %s FROM swh_directory_walk_one(%%s)' % cols cur.execute(query, (directory,)) yield from cur def directory_walk(self, directory, cur=None): cur = self._cursor(cur) cols = ', '.join(self.directory_ls_cols) query = 'SELECT %s FROM swh_directory_walk(%%s)' % cols cur.execute(query, (directory,)) yield from cur def directory_entry_get_by_path(self, directory, paths, cur=None): """Retrieve a directory entry by path. """ cur = self._cursor(cur) cols = ', '.join(self.directory_ls_cols) query = ( 'SELECT %s FROM swh_find_directory_entry_by_path(%%s, %%s)' % cols) cur.execute(query, (directory, paths)) data = cur.fetchone() if set(data) == {None}: return None return data def revision_missing_from_list(self, revisions, cur=None): cur = self._cursor(cur) yield from execute_values_generator( cur, """ SELECT id FROM (VALUES %s) as t(id) WHERE NOT EXISTS ( SELECT 1 FROM revision r WHERE r.id = t.id ) """, ((id,) for id in revisions)) revision_add_cols = [ 'id', 'date', 'date_offset', 'date_neg_utc_offset', 'committer_date', 'committer_date_offset', 'committer_date_neg_utc_offset', 'type', 'directory', 'message', 'author_fullname', 'author_name', 'author_email', 'committer_fullname', 'committer_name', 'committer_email', 'metadata', 'synthetic', ] revision_get_cols = revision_add_cols + ['parents'] def origin_visit_add(self, origin, ts, type, cur=None): """Add a new origin_visit for origin origin at timestamp ts with status 'ongoing'. 
Args: origin: origin concerned by the visit ts: the date of the visit type: type of loader for the visit Returns: The new visit index step for that origin """ cur = self._cursor(cur) self._cursor(cur).execute('SELECT swh_origin_visit_add(%s, %s, %s)', (origin, ts, type)) return cur.fetchone()[0] def origin_visit_update(self, origin_id, visit_id, updates, cur=None): """Update origin_visit's status.""" cur = self._cursor(cur) update_cols = [] values = [] where = ['origin.id = origin_visit.origin', 'origin.url=%s', 'visit=%s'] where_values = [origin_id, visit_id] if 'status' in updates: update_cols.append('status=%s') values.append(updates.pop('status')) if 'metadata' in updates: update_cols.append('metadata=%s') values.append(jsonize(updates.pop('metadata'))) if 'snapshot' in updates: update_cols.append('snapshot=%s') values.append(updates.pop('snapshot')) assert not updates, 'Unknown fields: %r' % updates query = """UPDATE origin_visit SET {update_cols} FROM origin WHERE {where}""".format(**{ 'update_cols': ', '.join(update_cols), 'where': ' AND '.join(where) }) cur.execute(query, (*values, *where_values)) def origin_visit_upsert(self, origin, visit, date, type, status, metadata, snapshot, cur=None): # doing an extra query like this is way simpler than trying to join # the origin id in the query below origin_id = next(self.origin_id_get_by_url([origin])) cur = self._cursor(cur) query = """INSERT INTO origin_visit ({cols}) VALUES ({values}) ON CONFLICT ON CONSTRAINT origin_visit_pkey DO UPDATE SET {updates}""".format( cols=', '.join(self.origin_visit_get_cols), values=', '.join('%s' for col in self.origin_visit_get_cols), updates=', '.join('{0}=excluded.{0}'.format(col) for col in self.origin_visit_get_cols)) cur.execute( query, (origin_id, visit, date, type, status, metadata, snapshot)) origin_visit_get_cols = [ 'origin', 'visit', 'date', 'type', 'status', 'metadata', 'snapshot'] origin_visit_select_cols = [ 'origin.url AS origin', 'visit', 'date', 'origin_visit.type AS type', 'status', 'metadata', 'snapshot'] def origin_visit_get_all(self, origin_id, last_visit=None, limit=None, cur=None): """Retrieve all visits for origin with id origin_id. Args: origin_id: The occurrence's origin Yields: The occurrence's history visits """ cur = self._cursor(cur) if last_visit: extra_condition = 'and visit > %s' args = (origin_id, last_visit, limit) else: extra_condition = '' args = (origin_id, limit) query = """\ SELECT %s FROM origin_visit INNER JOIN origin ON origin.id = origin_visit.origin WHERE origin.url=%%s %s order by visit asc limit %%s""" % ( ', '.join(self.origin_visit_select_cols), extra_condition ) cur.execute(query, args) yield from cur def origin_visit_get(self, origin_id, visit_id, cur=None): """Retrieve information on visit visit_id of origin origin_id. 
Args: origin_id: the origin concerned visit_id: The visit step for that origin Returns: The origin_visit information """ cur = self._cursor(cur) query = """\ SELECT %s FROM origin_visit INNER JOIN origin ON origin.id = origin_visit.origin WHERE origin.url = %%s AND visit = %%s """ % (', '.join(self.origin_visit_select_cols)) cur.execute(query, (origin_id, visit_id)) r = cur.fetchall() if not r: return None return r[0] def origin_visit_find_by_date(self, origin, visit_date, cur=None): cur = self._cursor(cur) cur.execute( 'SELECT * FROM swh_visit_find_by_date(%s, %s)', (origin, visit_date)) r = cur.fetchall() if r: return r[0] def origin_visit_exists(self, origin_id, visit_id, cur=None): """Check whether an origin visit with the given ids exists""" cur = self._cursor(cur) query = "SELECT 1 FROM origin_visit where origin = %s AND visit = %s" cur.execute(query, (origin_id, visit_id)) return bool(cur.fetchone()) def origin_visit_get_latest( self, origin_id, allowed_statuses=None, require_snapshot=False, cur=None): """Retrieve the most recent origin_visit of the given origin, with optional filters. Args: origin_id: the origin concerned allowed_statuses: the visit statuses allowed for the returned visit require_snapshot (bool): If True, only a visit with a known snapshot will be returned. Returns: The origin_visit information, or None if no visit matches. """ cur = self._cursor(cur) query_parts = [ 'SELECT %s' % ', '.join(self.origin_visit_select_cols), 'FROM origin_visit', 'INNER JOIN origin ON origin.id = origin_visit.origin'] query_parts.append('WHERE origin.url = %s') if require_snapshot: query_parts.append('AND snapshot is not null') if allowed_statuses: query_parts.append( cur.mogrify('AND status IN %s', (tuple(allowed_statuses),)).decode()) query_parts.append('ORDER BY date DESC, visit DESC LIMIT 1') query = '\n'.join(query_parts) cur.execute(query, (origin_id,)) r = cur.fetchone() if not r: return None return r @staticmethod def mangle_query_key(key, main_table): if key == 'id': return 't.id' if key == 'parents': return ''' ARRAY( SELECT rh.parent_id::bytea FROM revision_history rh WHERE rh.id = t.id ORDER BY rh.parent_rank )''' if '_' not in key: return '%s.%s' % (main_table, key) head, tail = key.split('_', 1) if (head in ('author', 'committer') and tail in ('name', 'email', 'id', 'fullname')): return '%s.%s' % (head, tail) return '%s.%s' % (main_table, key) def revision_get_from_list(self, revisions, cur=None): cur = self._cursor(cur) query_keys = ', '.join( self.mangle_query_key(k, 'revision') for k in self.revision_get_cols ) yield from execute_values_generator( cur, """ SELECT %s FROM (VALUES %%s) as t(id) LEFT JOIN revision ON t.id = revision.id LEFT JOIN person author ON revision.author = author.id LEFT JOIN person committer ON revision.committer = committer.id """ % query_keys, ((id,) for id in revisions)) def revision_log(self, root_revisions, limit=None, cur=None): cur = self._cursor(cur) query = """SELECT %s FROM swh_revision_log(%%s, %%s) """ % ', '.join(self.revision_get_cols) cur.execute(query, (root_revisions, limit)) yield from cur revision_shortlog_cols = ['id', 'parents'] def revision_shortlog(self, root_revisions, limit=None, cur=None): cur = self._cursor(cur) query = """SELECT %s FROM swh_revision_list(%%s, %%s) """ % ', '.join(self.revision_shortlog_cols) cur.execute(query, (root_revisions, limit)) yield from cur def release_missing_from_list(self, releases, cur=None): cur = self._cursor(cur) yield from execute_values_generator( cur, """ SELECT id FROM (VALUES %s) as 
t(id) WHERE NOT EXISTS ( SELECT 1 FROM release r WHERE r.id = t.id ) """, ((id,) for id in releases)) object_find_by_sha1_git_cols = ['sha1_git', 'type', 'id', 'object_id'] def object_find_by_sha1_git(self, ids, cur=None): cur = self._cursor(cur) yield from execute_values_generator( cur, """ WITH t (id) AS (VALUES %s), known_objects as (( select id as sha1_git, 'release'::object_type as type, id, object_id from release r where exists (select 1 from t where t.id = r.id) ) union all ( select id as sha1_git, 'revision'::object_type as type, id, object_id from revision r where exists (select 1 from t where t.id = r.id) ) union all ( select id as sha1_git, 'directory'::object_type as type, id, object_id from directory d where exists (select 1 from t where t.id = d.id) ) union all ( select sha1_git as sha1_git, 'content'::object_type as type, sha1 as id, object_id from content c where exists (select 1 from t where t.id = c.sha1_git) )) select t.id as sha1_git, k.type, k.id, k.object_id from t left join known_objects k on t.id = k.sha1_git """, ((id,) for id in ids) ) def stat_counters(self, cur=None): cur = self._cursor(cur) cur.execute('SELECT * FROM swh_stat_counters()') yield from cur def origin_add(self, url, cur=None): """Insert a new origin and return the new identifier.""" insert = """INSERT INTO origin (url) values (%s) RETURNING url""" cur.execute(insert, (url,)) return cur.fetchone()[0] origin_cols = ['url'] def origin_get_by_url(self, origins, cur=None): """Retrieve origin `(type, url)` from urls if found.""" cur = self._cursor(cur) query = """SELECT %s FROM (VALUES %%s) as t(url) LEFT JOIN origin ON t.url = origin.url """ % ','.join('origin.' + col for col in self.origin_cols) yield from execute_values_generator( cur, query, ((url,) for url in origins)) def origin_get_by_sha1(self, sha1s, cur=None): """Retrieve origin urls from sha1s if found.""" cur = self._cursor(cur) query = """SELECT %s FROM (VALUES %%s) as t(sha1) LEFT JOIN origin ON t.sha1 = digest(origin.url, 'sha1') """ % ','.join('origin.' + col for col in self.origin_cols) yield from execute_values_generator( cur, query, ((sha1,) for sha1 in sha1s)) def origin_visit_get_random(self, type, cur=None): """Randomly select one origin whose last visit was full in the last 3 months """ cur = self._cursor(cur) columns = ','.join(self.origin_visit_select_cols) - query = f"""select {columns} - from origin_visit tablesample bernoulli (1) + query = f"""with visits as ( + select * + from origin_visit + where origin_visit.status='full' and + origin_visit.type=%s and + origin_visit.date > now() - '3 months'::interval + ) + select {columns} + from visits as origin_visit inner join origin - on origin_visit.origin = origin.id - where origin_visit.status='full' and - origin_visit.type=%s and - origin_visit.date > now() - '3 months'::interval + on origin_visit.origin=origin.id + where random() < 0.1 limit 1 """ cur.execute(query, (type, )) return cur.fetchone() def origin_id_get_by_url(self, origins, cur=None): """Retrieve origin `(type, url)` from urls if found.""" cur = self._cursor(cur) query = """SELECT id FROM (VALUES %s) as t(url) LEFT JOIN origin ON t.url = origin.url """ for row in execute_values_generator( cur, query, ((url,) for url in origins)): yield row[0] origin_get_range_cols = ['id', 'url'] def origin_get_range(self, origin_from=1, origin_count=100, cur=None): """Retrieve ``origin_count`` origins whose ids are greater or equal than ``origin_from``. Origins are sorted by id before retrieving them. 
Args: origin_from (int): the minimum id of origins to retrieve origin_count (int): the maximum number of origins to retrieve """ cur = self._cursor(cur) query = """SELECT %s FROM origin WHERE id >= %%s ORDER BY id LIMIT %%s """ % ','.join(self.origin_get_range_cols) cur.execute(query, (origin_from, origin_count)) yield from cur def _origin_query(self, url_pattern, count=False, offset=0, limit=50, regexp=False, with_visit=False, cur=None): """ Method factorizing query creation for searching and counting origins. """ cur = self._cursor(cur) if count: origin_cols = 'COUNT(*)' else: origin_cols = ','.join(self.origin_cols) query = """SELECT %s FROM origin WHERE """ if with_visit: query += """ EXISTS ( SELECT 1 FROM origin_visit INNER JOIN snapshot ON snapshot=snapshot.id WHERE origin=origin.id ) AND """ query += 'url %s %%s ' if not count: query += 'ORDER BY id OFFSET %%s LIMIT %%s' if not regexp: query = query % (origin_cols, 'ILIKE') query_params = ('%'+url_pattern+'%', offset, limit) else: query = query % (origin_cols, '~*') query_params = (url_pattern, offset, limit) if count: query_params = (query_params[0],) cur.execute(query, query_params) def origin_search(self, url_pattern, offset=0, limit=50, regexp=False, with_visit=False, cur=None): """Search for origins whose urls contain a provided string pattern or match a provided regular expression. The search is performed in a case insensitive way. Args: url_pattern (str): the string pattern to search for in origin urls offset (int): number of found origins to skip before returning results limit (int): the maximum number of found origins to return regexp (bool): if True, consider the provided pattern as a regular expression and returns origins whose urls match it with_visit (bool): if True, filter out origins with no visit """ self._origin_query(url_pattern, offset=offset, limit=limit, regexp=regexp, with_visit=with_visit, cur=cur) yield from cur def origin_count(self, url_pattern, regexp=False, with_visit=False, cur=None): """Count origins whose urls contain a provided string pattern or match a provided regular expression. The pattern search in origin urls is performed in a case insensitive way. Args: url_pattern (str): the string pattern to search for in origin urls regexp (bool): if True, consider the provided pattern as a regular expression and returns origins whose urls match it with_visit (bool): if True, filter out origins with no visit """ self._origin_query(url_pattern, count=True, regexp=regexp, with_visit=with_visit, cur=cur) return cur.fetchone()[0] release_add_cols = [ 'id', 'target', 'target_type', 'date', 'date_offset', 'date_neg_utc_offset', 'name', 'comment', 'synthetic', 'author_fullname', 'author_name', 'author_email', ] release_get_cols = release_add_cols def release_get_from_list(self, releases, cur=None): cur = self._cursor(cur) query_keys = ', '.join( self.mangle_query_key(k, 'release') for k in self.release_get_cols ) yield from execute_values_generator( cur, """ SELECT %s FROM (VALUES %%s) as t(id) LEFT JOIN release ON t.id = release.id LEFT JOIN person author ON release.author = author.id """ % query_keys, ((id,) for id in releases)) def origin_metadata_add(self, origin, ts, provider, tool, metadata, cur=None): """ Add an origin_metadata for the origin at ts with provider, tool and metadata. 
Args: origin (int): the origin's id for which the metadata is added ts (datetime): time when the metadata was found provider (int): the metadata provider identifier tool (int): the tool's identifier used to extract metadata metadata (jsonb): the metadata retrieved at the time and location Returns: id (int): the origin_metadata unique id """ cur = self._cursor(cur) insert = """INSERT INTO origin_metadata (origin_id, discovery_date, provider_id, tool_id, metadata) SELECT id, %s, %s, %s, %s FROM origin WHERE url = %s""" cur.execute(insert, (ts, provider, tool, jsonize(metadata), origin)) origin_metadata_get_cols = ['origin_url', 'discovery_date', 'tool_id', 'metadata', 'provider_id', 'provider_name', 'provider_type', 'provider_url'] def origin_metadata_get_by(self, origin_url, provider_type=None, cur=None): """Retrieve all origin_metadata entries for one origin_url """ cur = self._cursor(cur) if not provider_type: query = '''SELECT %s FROM swh_origin_metadata_get_by_origin( %%s)''' % (','.join( self.origin_metadata_get_cols)) cur.execute(query, (origin_url, )) else: query = '''SELECT %s FROM swh_origin_metadata_get_by_provider_type( %%s, %%s)''' % (','.join( self.origin_metadata_get_cols)) cur.execute(query, (origin_url, provider_type)) yield from cur tool_cols = ['id', 'name', 'version', 'configuration'] @stored_procedure('swh_mktemp_tool') def mktemp_tool(self, cur=None): pass def tool_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("SELECT %s from swh_tool_add()" % ( ','.join(self.tool_cols), )) yield from cur def tool_get(self, name, version, configuration, cur=None): cur = self._cursor(cur) cur.execute('''select %s from tool where name=%%s and version=%%s and configuration=%%s''' % ( ','.join(self.tool_cols)), (name, version, configuration)) return cur.fetchone() metadata_provider_cols = ['id', 'provider_name', 'provider_type', 'provider_url', 'metadata'] def metadata_provider_add(self, provider_name, provider_type, provider_url, metadata, cur=None): """Insert a new provider and return the new identifier.""" cur = self._cursor(cur) insert = """INSERT INTO metadata_provider (provider_name, provider_type, provider_url, metadata) values (%s, %s, %s, %s) RETURNING id""" cur.execute(insert, (provider_name, provider_type, provider_url, jsonize(metadata))) return cur.fetchone()[0] def metadata_provider_get(self, provider_id, cur=None): cur = self._cursor(cur) cur.execute('''select %s from metadata_provider where id=%%s ''' % ( ','.join(self.metadata_provider_cols)), (provider_id, )) return cur.fetchone() def metadata_provider_get_by(self, provider_name, provider_url, cur=None): cur = self._cursor(cur) cur.execute('''select %s from metadata_provider where provider_name=%%s and provider_url=%%s''' % ( ','.join(self.metadata_provider_cols)), (provider_name, provider_url)) return cur.fetchone() diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql index a093850f..b9d21816 100644 --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -1,469 +1,468 @@ --- --- SQL implementation of the Software Heritage data model --- -- schema versions create table dbversion ( version int primary key, release timestamptz, description text ); comment on table dbversion is 'Details of current db version'; comment on column dbversion.version is 'SQL schema version'; comment on column dbversion.release is 'Version deployment timestamp'; comment on column dbversion.description is 'Release description'; -- latest schema version insert into 
dbversion(version, release, description) - values(144, now(), 'Work In Progress'); + values(145, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a Git-style salted SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); -- a SHA256 checksum create domain sha256 as bytea check (length(value) = 32); -- a blake2 checksum create domain blake2s256 as bytea check (length(value) = 32); -- UNIX path (absolute, relative, individual path component, etc.) create domain unix_path as bytea; -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod create domain file_perms as int; -- Checksums about actual file content. Note that the content itself is not -- stored in the DB, but on external (key-value) storage. A single checksum is -- used as key there, but the other can be used to verify that we do not inject -- content collisions not knowingly. create table content ( sha1 sha1 not null, sha1_git sha1_git not null, sha256 sha256 not null, blake2s256 blake2s256, length bigint not null, ctime timestamptz not null default now(), -- creation time, i.e. time of (first) injection into the storage status content_status not null default 'visible', object_id bigserial ); comment on table content is 'Checksums of file content which is actually stored externally'; comment on column content.sha1 is 'Content sha1 hash'; comment on column content.sha1_git is 'Git object sha1 hash'; comment on column content.sha256 is 'Content Sha256 hash'; comment on column content.blake2s256 is 'Content blake2s hash'; comment on column content.length is 'Content length'; comment on column content.ctime is 'First seen time'; comment on column content.status is 'Content status (absent, visible, hidden)'; comment on column content.object_id is 'Content identifier'; -- An origin is a place, identified by an URL, where software source code -- artifacts can be found. We support different kinds of origins, e.g., git and -- other VCS repositories, web pages that list tarballs URLs (e.g., -- http://www.kernel.org), indirect tarball URLs (e.g., -- http://www.example.org/latest.tar.gz), etc. The key feature of an origin is -- that it can be *fetched* from (wget, git clone, svn checkout, etc.) to -- retrieve all the contained software. create table origin ( id bigserial not null, url text not null ); comment on column origin.id is 'Artifact origin id'; comment on column origin.url is 'URL of origin'; -- Content blobs observed somewhere, but not ingested into the archive for -- whatever reason. This table is separate from the content table as we might -- not have the sha1 checksum of skipped contents (for instance when we inject -- git repositories, objects that are too big will be skipped here, and we will -- only know their sha1_git). 'reason' contains the reason the content was -- skipped. origin is a nullable column allowing to find out which origin -- contains that skipped content. 
create table skipped_content ( sha1 sha1, sha1_git sha1_git, sha256 sha256, blake2s256 blake2s256, length bigint not null, ctime timestamptz not null default now(), status content_status not null default 'absent', reason text not null, origin bigint, object_id bigserial ); comment on table skipped_content is 'Content blobs observed, but not ingested in the archive'; comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; comment on column skipped_content.sha1_git is 'Git object sha1 hash'; comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; comment on column skipped_content.length is 'Skipped content length'; comment on column skipped_content.ctime is 'First seen time'; comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; comment on column skipped_content.reason is 'Reason for skipping'; comment on column skipped_content.origin is 'Origin table identifier'; comment on column skipped_content.object_id is 'Skipped content identifier'; -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). -- -- To list the contents of a directory: -- 1. list the contained directory_entry_dir using array dir_entries -- 2. list the contained directory_entry_file using array file_entries -- 3. list the contained directory_entry_rev using array rev_entries -- 4. UNION -- -- Synonyms/mappings: -- * git: tree create table directory ( id sha1_git not null, dir_entries bigint[], -- sub-directories, reference directory_entry_dir file_entries bigint[], -- contained files, reference directory_entry_file rev_entries bigint[], -- mounted revisions, reference directory_entry_rev object_id bigserial -- short object identifier ); comment on table directory is 'Contents of a directory, synonymous to tree (git)'; comment on column directory.id is 'Git object sha1 hash'; comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir'; comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; comment on column directory.object_id is 'Short object identifier'; -- A directory entry pointing to a (sub-)directory. create table directory_entry_dir ( id bigserial, target sha1_git not null, -- id of target directory name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_dir is 'Directory entry for directory'; comment on column directory_entry_dir.id is 'Directory identifier'; comment on column directory_entry_dir.target is 'Target directory identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- A directory entry pointing to a file content. 
create table directory_entry_file ( id bigserial, target sha1_git not null, -- id of target file name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_file is 'Directory entry for file'; comment on column directory_entry_file.id is 'File identifier'; comment on column directory_entry_file.target is 'Target file identifier'; comment on column directory_entry_file.name is 'Path name, relative to containing directory'; comment on column directory_entry_file.perms is 'Unix-like permissions'; -- A directory entry pointing to a revision. create table directory_entry_rev ( id bigserial, target sha1_git not null, -- id of target revision name unix_path not null, -- path name, relative to containing dir perms file_perms not null -- unix-like permissions ); comment on table directory_entry_rev is 'Directory entry for revision'; comment on column directory_entry_dir.id is 'Revision identifier'; comment on column directory_entry_dir.target is 'Target revision in identifier'; comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; comment on column directory_entry_dir.perms is 'Unix-like permissions'; -- A person referenced by some source code artifacts, e.g., a VCS revision or -- release metadata. create table person ( id bigserial, name bytea, -- advisory: not null if we managed to parse a name email bytea, -- advisory: not null if we managed to parse an email fullname bytea not null -- freeform specification; what is actually used in the checksums -- will usually be of the form 'name ' ); comment on table person is 'Person referenced in code artifact release metadata'; comment on column person.id is 'Person identifier'; comment on column person.name is 'Name'; comment on column person.email is 'Email'; comment on column person.fullname is 'Full name (raw name)'; -- The state of a source code tree at a specific point in time. -- -- Synonyms/mappings: -- * git / subversion / etc: commit -- * tarball: a specific tarball -- -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in -- case of merges) parent revisions. Each revision points to a directory, i.e., -- a file-system tree containing files and directories. create table revision ( id sha1_git not null, date timestamptz, date_offset smallint, committer_date timestamptz, committer_date_offset smallint, type revision_type not null, directory sha1_git, -- source code 'root' directory message bytea, author bigint, committer bigint, synthetic boolean not null default false, -- true iff revision has been created by Software Heritage metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...) 
object_id bigserial, date_neg_utc_offset boolean, committer_date_neg_utc_offset boolean ); comment on table revision is 'Revision represents the state of a source code tree at a specific point in time'; comment on column revision.id is 'Git id of sha1 checksum'; comment on column revision.date is 'Timestamp when revision was authored'; comment on column revision.date_offset is 'Authored timestamp offset from UTC'; comment on column revision.committer_date is 'Timestamp when revision was committed'; comment on column revision.committer_date_offset is 'Committed timestamp offset from UTC'; comment on column revision.type is 'Possible revision types (''git'', ''tar'', ''dsc'', ''svn'', ''hg'')'; comment on column revision.directory is 'Directory identifier'; comment on column revision.message is 'Revision message'; comment on column revision.author is 'Author identifier'; comment on column revision.committer is 'Committer identifier'; comment on column revision.synthetic is 'true iff revision has been created by Software Heritage'; comment on column revision.metadata is 'extra metadata (tarball checksums, extra commit information, etc...)'; comment on column revision.object_id is 'Object identifier'; comment on column revision.date_neg_utc_offset is 'True indicates -0 UTC offset for author timestamp'; comment on column revision.committer_date_neg_utc_offset is 'True indicates -0 UTC offset for committer timestamp'; -- either this table or the sha1_git[] column on the revision table create table revision_history ( id sha1_git not null, parent_id sha1_git not null, parent_rank int not null default 0 -- parent position in merge commits, 0-based ); comment on table revision_history is 'Sequence of revision history with parent and position in history'; comment on column revision_history.id is 'Revision history git object sha1 checksum'; comment on column revision_history.parent_id is 'Parent revision git object identifier'; comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; -- Crawling history of software origins visited by Software Heritage. Each -- visit is a 3-way mapping between a software origin, a timestamp, and a -- snapshot object capturing the full-state of the origin at visit time. create table origin_visit ( origin bigint not null, visit bigint not null, date timestamptz not null, type text not null, status origin_visit_status not null, metadata jsonb, snapshot sha1_git ); comment on column origin_visit.origin is 'Visited origin'; comment on column origin_visit.visit is 'Sequential visit number for the origin'; comment on column origin_visit.date is 'Visit timestamp'; comment on column origin_visit.type is 'Type of loader that did the visit (hg, git, ...)'; comment on column origin_visit.status is 'Visit result'; comment on column origin_visit.metadata is 'Origin metadata at visit time'; comment on column origin_visit.snapshot is 'Origin snapshot at visit time'; -- A snapshot represents the entire state of a software origin as crawled by -- Software Heritage. This table is a simple mapping between (public) intrinsic -- snapshot identifiers and (private) numeric sequential identifiers. 
create table snapshot ( object_id bigserial not null, -- PK internal object identifier id sha1_git not null -- snapshot intrinsic identifier ); comment on table snapshot is 'State of a software origin as crawled by Software Heritage'; comment on column snapshot.object_id is 'Internal object identifier'; comment on column snapshot.id is 'Intrinsic snapshot identifier'; -- Each snapshot associate "branch" names to other objects in the Software -- Heritage Merkle DAG. This table describes branches as mappings between names -- and target typed objects. create table snapshot_branch ( object_id bigserial not null, -- PK internal object identifier name bytea not null, -- branch name, e.g., "master" or "feature/drag-n-drop" target bytea, -- target object identifier, e.g., a revision identifier target_type snapshot_target -- target object type, e.g., "revision" ); comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG'; comment on column snapshot_branch.object_id is 'Internal object identifier'; comment on column snapshot_branch.name is 'Branch name'; comment on column snapshot_branch.target is 'Target object identifier'; comment on column snapshot_branch.target_type is 'Target object type'; -- Mapping between snapshots and their branches. create table snapshot_branches ( snapshot_id bigint not null, -- snapshot identifier, ref. snapshot.object_id branch_id bigint not null -- branch identifier, ref. snapshot_branch.object_id ); comment on table snapshot_branches is 'Mapping between snapshot and their branches'; comment on column snapshot_branches.snapshot_id is 'Snapshot identifier'; comment on column snapshot_branches.branch_id is 'Branch identifier'; -- A "memorable" point in time in the development history of a software -- project. 
-- -- Synonyms/mappings: -- * git: tag (of the annotated kind, otherwise they are just references) -- * tarball: the release version number create table release ( id sha1_git not null, target sha1_git, date timestamptz, date_offset smallint, name bytea, comment bytea, author bigint, synthetic boolean not null default false, -- true iff release has been created by Software Heritage object_id bigserial, target_type object_type not null, date_neg_utc_offset boolean ); comment on table release is 'Details of a software release, synonymous with a tag (git) or version number (tarball)'; comment on column release.id is 'Release git identifier'; comment on column release.target is 'Target git identifier'; comment on column release.date is 'Release timestamp'; comment on column release.date_offset is 'Timestamp offset from UTC'; comment on column release.name is 'Name'; comment on column release.comment is 'Comment'; comment on column release.author is 'Author'; comment on column release.synthetic is 'Indicates if created by Software Heritage'; comment on column release.object_id is 'Object identifier'; comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; -- Tools create table tool ( id serial not null, name text not null, version text not null, configuration jsonb ); comment on table tool is 'Tool information'; comment on column tool.id is 'Tool identifier'; comment on column tool.version is 'Tool name'; comment on column tool.version is 'Tool version'; comment on column tool.configuration is 'Tool configuration: command line, flags, etc...'; create table metadata_provider ( id serial not null, provider_name text not null, provider_type text not null, provider_url text, metadata jsonb ); comment on table metadata_provider is 'Metadata provider information'; comment on column metadata_provider.id is 'Provider''s identifier'; comment on column metadata_provider.provider_name is 'Provider''s name'; comment on column metadata_provider.provider_url is 'Provider''s url'; comment on column metadata_provider.metadata is 'Other metadata about provider'; -- Discovery of metadata during a listing, loading, deposit or external_catalog of an origin -- also provides a translation to a defined json schema using a translation tool (tool_id) create table origin_metadata ( id bigserial not null, -- PK internal object identifier origin_id bigint not null, -- references origin(id) discovery_date timestamptz not null, -- when it was extracted provider_id bigint not null, -- ex: 'hal', 'lister-github', 'loader-github' tool_id bigint not null, metadata jsonb not null ); comment on table origin_metadata is 'keeps all metadata found concerning an origin'; comment on column origin_metadata.id is 'the origin_metadata object''s id'; comment on column origin_metadata.origin_id is 'the origin id for which the metadata was found'; comment on column origin_metadata.discovery_date is 'the date of retrieval'; comment on column origin_metadata.provider_id is 'the metadata provider: github, openhub, deposit, etc.'; comment on column origin_metadata.tool_id is 'the tool used for extracting metadata: lister-github, etc.'; comment on column origin_metadata.metadata is 'metadata in json format but with original terms'; -- Keep a cache of object counts create table object_counts ( object_type text, -- table for which we're counting objects (PK) value bigint, -- 
count of objects in the table last_update timestamptz, -- last update for the object count in this table single_update boolean -- whether we update this table standalone (true) or through bucketed counts (false) ); comment on table object_counts is 'Cache of object counts'; comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts.value is 'Count of objects in the table'; comment on column object_counts.last_update is 'Last update for object count'; comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)'; create table object_counts_bucketed ( line serial not null, -- PK object_type text not null, -- table for which we're counting objects identifier text not null, -- identifier across which we're bucketing objects bucket_start bytea, -- lower bound (inclusive) for the bucket bucket_end bytea, -- upper bound (exclusive) for the bucket value bigint, -- count of objects in the bucket last_update timestamptz -- last update for the object count in this bucket ); comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type'; comment on column object_counts_bucketed.line is 'Auto incremented idenitfier value'; comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects'; comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket'; comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket'; - diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql index 182c510a..153fc0ff 100644 --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -1,179 +1,180 @@ -- content create unique index concurrently content_pkey on content(sha1); create unique index concurrently on content(sha1_git); create index concurrently on content(sha256); create index concurrently on content(blake2s256); create index concurrently on content(ctime); -- TODO use a BRIN index here (postgres >= 9.5) create unique index concurrently on content(object_id); alter table content add primary key using index content_pkey; -- origin create unique index concurrently origin_pkey on origin(id); alter table origin add primary key using index origin_pkey; create index concurrently on origin using gin (url gin_trgm_ops); create index concurrently on origin using hash (url); create index concurrently on origin using btree(digest(url, 'sha1')); -- skipped_content alter table skipped_content add constraint skipped_content_sha1_sha1_git_sha256_key unique (sha1, sha1_git, sha256); create index concurrently on skipped_content(sha1); create index concurrently on skipped_content(sha1_git); create index concurrently on skipped_content(sha256); create index concurrently on skipped_content(blake2s256); create unique index concurrently on skipped_content(object_id); alter table skipped_content add constraint skipped_content_origin_fkey foreign key (origin) references origin(id) not valid; alter table skipped_content validate constraint skipped_content_origin_fkey; -- directory create unique index 
concurrently directory_pkey on directory(id); alter table directory add primary key using index directory_pkey; create index concurrently on directory using gin (dir_entries); create index concurrently on directory using gin (file_entries); create index concurrently on directory using gin (rev_entries); create unique index concurrently on directory(object_id); -- directory_entry_dir create unique index concurrently directory_entry_dir_pkey on directory_entry_dir(id); alter table directory_entry_dir add primary key using index directory_entry_dir_pkey; create unique index concurrently on directory_entry_dir(target, name, perms); -- directory_entry_file create unique index concurrently directory_entry_file_pkey on directory_entry_file(id); alter table directory_entry_file add primary key using index directory_entry_file_pkey; create unique index concurrently on directory_entry_file(target, name, perms); -- directory_entry_rev create unique index concurrently directory_entry_rev_pkey on directory_entry_rev(id); alter table directory_entry_rev add primary key using index directory_entry_rev_pkey; create unique index concurrently on directory_entry_rev(target, name, perms); -- person create unique index concurrently person_pkey on person(id); alter table person add primary key using index person_pkey; create unique index concurrently on person(fullname); create index concurrently on person(name); create index concurrently on person(email); -- revision create unique index concurrently revision_pkey on revision(id); alter table revision add primary key using index revision_pkey; alter table revision add constraint revision_author_fkey foreign key (author) references person(id) not valid; alter table revision validate constraint revision_author_fkey; alter table revision add constraint revision_committer_fkey foreign key (committer) references person(id) not valid; alter table revision validate constraint revision_committer_fkey; create index concurrently on revision(directory); create unique index concurrently on revision(object_id); -- revision_history create unique index concurrently revision_history_pkey on revision_history(id, parent_rank); alter table revision_history add primary key using index revision_history_pkey; create index concurrently on revision_history(parent_id); alter table revision_history add constraint revision_history_id_fkey foreign key (id) references revision(id) not valid; alter table revision_history validate constraint revision_history_id_fkey; -- snapshot create unique index concurrently snapshot_pkey on snapshot(object_id); alter table snapshot add primary key using index snapshot_pkey; create unique index concurrently on snapshot(id); -- snapshot_branch create unique index concurrently snapshot_branch_pkey on snapshot_branch(object_id); alter table snapshot_branch add primary key using index snapshot_branch_pkey; create unique index concurrently on snapshot_branch (target_type, target, name); alter table snapshot_branch add constraint snapshot_branch_target_check check ((target_type is null) = (target is null)) not valid; alter table snapshot_branch validate constraint snapshot_branch_target_check; alter table snapshot_branch add constraint snapshot_target_check check (target_type not in ('content', 'directory', 'revision', 'release', 'snapshot') or length(target) = 20) not valid; alter table snapshot_branch validate constraint snapshot_target_check; create unique index concurrently on snapshot_branch (name) where target_type is null and target is null; -- 
snapshot_branches create unique index concurrently snapshot_branches_pkey on snapshot_branches(snapshot_id, branch_id); alter table snapshot_branches add primary key using index snapshot_branches_pkey; alter table snapshot_branches add constraint snapshot_branches_snapshot_id_fkey foreign key (snapshot_id) references snapshot(object_id) not valid; alter table snapshot_branches validate constraint snapshot_branches_snapshot_id_fkey; alter table snapshot_branches add constraint snapshot_branches_branch_id_fkey foreign key (branch_id) references snapshot_branch(object_id) not valid; alter table snapshot_branches validate constraint snapshot_branches_branch_id_fkey; -- origin_visit create unique index concurrently origin_visit_pkey on origin_visit(origin, visit); alter table origin_visit add primary key using index origin_visit_pkey; create index concurrently on origin_visit(date); +create index concurrently on origin_visit(type, status, date); alter table origin_visit add constraint origin_visit_origin_fkey foreign key (origin) references origin(id) not valid; alter table origin_visit validate constraint origin_visit_origin_fkey; -- release create unique index concurrently release_pkey on release(id); alter table release add primary key using index release_pkey; create index concurrently on release(target, target_type); create unique index concurrently on release(object_id); alter table release add constraint release_author_fkey foreign key (author) references person(id) not valid; alter table release validate constraint release_author_fkey; -- if the author is null, then the date must be null alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid; alter table release validate constraint release_author_date_check; -- tool create unique index tool_pkey on tool(id); alter table tool add primary key using index tool_pkey; create unique index on tool(name, version, configuration); -- metadata_provider create unique index concurrently metadata_provider_pkey on metadata_provider(id); alter table metadata_provider add primary key using index metadata_provider_pkey; create index concurrently on metadata_provider(provider_name, provider_url); -- origin_metadata create unique index concurrently origin_metadata_pkey on origin_metadata(id); alter table origin_metadata add primary key using index origin_metadata_pkey; create index concurrently on origin_metadata(origin_id, provider_id, tool_id); alter table origin_metadata add constraint origin_metadata_origin_fkey foreign key (origin_id) references origin(id) not valid; alter table origin_metadata validate constraint origin_metadata_origin_fkey; alter table origin_metadata add constraint origin_metadata_provider_fkey foreign key (provider_id) references metadata_provider(id) not valid; alter table origin_metadata validate constraint origin_metadata_provider_fkey; alter table origin_metadata add constraint origin_metadata_tool_fkey foreign key (tool_id) references tool(id) not valid; alter table origin_metadata validate constraint origin_metadata_tool_fkey; -- object_counts create unique index concurrently object_counts_pkey on object_counts(object_type); alter table object_counts add primary key using index object_counts_pkey; -- object_counts_bucketed create unique index concurrently object_counts_bucketed_pkey on object_counts_bucketed(line); alter table object_counts_bucketed add primary key using index object_counts_bucketed_pkey;
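
A quick way to sanity-check the intent of this patch is to run the filtering CTE from the rewritten origin_visit_get_random query under EXPLAIN and confirm the planner uses the new origin_visit(type, status, date) index: with type and status as leading equality columns and date as the trailing range column, the index serves the CTE's filter directly, so the random() < 0.1 sampling only runs over the small set of recent full visits instead of the previous tablesample bernoulli scan of the whole table. The following is only an illustrative verification sketch against an SWH database in psql, not part of the migration; the literal 'git' visit type and the EXPLAIN options are assumptions, and EXPLAIN ANALYZE actually executes the query.

-- verification sketch (assumed psql session on an SWH storage database);
-- 'git' stands in for the %s parameter bound by origin_visit_get_random
explain (analyze, buffers)
with visits as (
  select *
  from origin_visit
  where origin_visit.status = 'full'
    and origin_visit.type = 'git'
    and origin_visit.date > now() - '3 months'::interval
)
select origin.url, visit, date
from visits as origin_visit
inner join origin
  on origin_visit.origin = origin.id
where random() < 0.1
limit 1;

If the index is picked up, the plan for the CTE should show an index (or bitmap index) scan on the new composite index rather than a sequential scan of origin_visit.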