D5014.id17885.diff

diff --git a/swh/storage/tests/data/sql-v0.18.0/10-superuser-init.sql b/swh/storage/tests/data/sql-v0.18.0/10-superuser-init.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/10-superuser-init.sql
@@ -0,0 +1,27 @@
+-- requires being a Postgres superuser
+
+create extension if not exists btree_gist;
+create extension if not exists pgcrypto;
+create extension if not exists pg_trgm;
+
+-- courtesy of Andreas 'ads' Scherbaum in
+-- https://andreas.scherbaum.la/blog/archives/346-create-language-if-not-exist.html
+create or replace function public.create_plpgsql_language ()
+ returns text
+ as $$
+ create language plpgsql;
+ select 'language plpgsql created'::text;
+ $$
+language 'sql';
+
+select case when
+ (select true::boolean
+ from pg_language
+ where lanname='plpgsql')
+ then
+ (select 'language already installed'::text)
+ else
+ (select public.create_plpgsql_language())
+ end;
+
+drop function public.create_plpgsql_language ();
diff --git a/swh/storage/tests/data/sql-v0.18.0/15-flavor.sql b/swh/storage/tests/data/sql-v0.18.0/15-flavor.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/15-flavor.sql
@@ -0,0 +1,22 @@
+-- database flavor
+create type database_flavor as enum (
+ 'default', -- default: full index availability for deduplication and read queries
+ 'mirror', -- mirror: reduced indexes to allow for out of order insertions
+ 'read_replica' -- read replica: minimal indexes to allow read queries
+);
+comment on type database_flavor is 'Flavor of the current database';
+
+create table dbflavor (
+ flavor database_flavor,
+ single_row char(1) primary key default 'x',
+ check (single_row = 'x')
+);
+comment on table dbflavor is 'Database flavor storage';
+comment on column dbflavor.flavor is 'Database flavor currently deployed';
+comment on column dbflavor.single_row is 'Bogus column to force the table to have a single row';
+
+create or replace function swh_get_dbflavor() returns database_flavor language sql stable as $$
+ select coalesce((select flavor from dbflavor), 'default');
+$$;
+
+comment on function swh_get_dbflavor is 'Get the flavor of the database currently deployed';
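+
+-- Illustrative sketch (kept commented out so this file stays flavor-neutral):
+-- a mirror deployment would record its flavor once, after which
+-- swh_get_dbflavor() reports it to the rest of the schema scripts:
+--
+--   insert into dbflavor (flavor) values ('mirror');
+--   select swh_get_dbflavor();  -- 'mirror' ('default' when the table is empty)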
diff --git a/swh/storage/tests/data/sql-v0.18.0/20-enums.sql b/swh/storage/tests/data/sql-v0.18.0/20-enums.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/20-enums.sql
@@ -0,0 +1,23 @@
+---
+--- Software Heritage Data Types
+---
+
+create type content_status as enum ('absent', 'visible', 'hidden');
+comment on type content_status is 'Content visibility';
+
+create type revision_type as enum ('git', 'tar', 'dsc', 'svn', 'hg');
+comment on type revision_type is 'Possible revision types';
+
+create type object_type as enum ('content', 'directory', 'revision', 'release', 'snapshot');
+comment on type object_type is 'Data object types stored in data model';
+
+create type snapshot_target as enum ('content', 'directory', 'revision', 'release', 'snapshot', 'alias');
+comment on type snapshot_target is 'Types of targets for snapshot branches';
+
+create type origin_visit_state as enum (
+ 'created',
+ 'ongoing',
+ 'full',
+ 'partial'
+);
+comment on type origin_visit_state is 'Possible origin visit values';
diff --git a/swh/storage/tests/data/sql-v0.18.0/30-schema.sql b/swh/storage/tests/data/sql-v0.18.0/30-schema.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/30-schema.sql
@@ -0,0 +1,499 @@
+---
+--- SQL implementation of the Software Heritage data model
+---
+
+-- schema versions
+create table dbversion
+(
+ version int primary key,
+ release timestamptz,
+ description text
+);
+
+comment on table dbversion is 'Details of current db version';
+comment on column dbversion.version is 'SQL schema version';
+comment on column dbversion.release is 'Version deployment timestamp';
+comment on column dbversion.description is 'Release description';
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(164, now(), 'Work In Progress');
+
+-- a SHA1 checksum
+create domain sha1 as bytea check (length(value) = 20);
+
+-- a Git object ID, i.e., a Git-style salted SHA1 checksum
+create domain sha1_git as bytea check (length(value) = 20);
+
+-- a SHA256 checksum
+create domain sha256 as bytea check (length(value) = 32);
+
+-- a blake2 checksum
+create domain blake2s256 as bytea check (length(value) = 32);
+
+-- UNIX path (absolute, relative, individual path component, etc.)
+create domain unix_path as bytea;
+
+-- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
+create domain file_perms as int;
+
+-- an SWHID
+create domain swhid as text check (value ~ '^swh:[0-9]+:.*');
+
+
+-- Checksums about actual file content. Note that the content itself is not
+-- stored in the DB, but on external (key-value) storage. A single checksum is
+-- used as the key there, but the others can be used to verify that we do not
+-- unknowingly inject content collisions.
+create table content
+(
+ sha1 sha1 not null,
+ sha1_git sha1_git not null,
+ sha256 sha256 not null,
+ blake2s256 blake2s256 not null,
+ length bigint not null,
+ ctime timestamptz not null default now(),
+ -- creation time, i.e. time of (first) injection into the storage
+ status content_status not null default 'visible',
+ object_id bigserial
+);
+
+comment on table content is 'Checksums of file content which is actually stored externally';
+comment on column content.sha1 is 'Content sha1 hash';
+comment on column content.sha1_git is 'Git object sha1 hash';
+comment on column content.sha256 is 'Content Sha256 hash';
+comment on column content.blake2s256 is 'Content blake2s hash';
+comment on column content.length is 'Content length';
+comment on column content.ctime is 'First seen time';
+comment on column content.status is 'Content status (absent, visible, hidden)';
+comment on column content.object_id is 'Content identifier';
+
+
+-- An origin is a place, identified by a URL, where software source code
+-- artifacts can be found. We support different kinds of origins, e.g., git and
+-- other VCS repositories, web pages that list tarball URLs (e.g.,
+-- http://www.kernel.org), indirect tarball URLs (e.g.,
+-- http://www.example.org/latest.tar.gz), etc. The key feature of an origin is
+-- that it can be *fetched* from (wget, git clone, svn checkout, etc.) to
+-- retrieve all the contained software.
+create table origin
+(
+ id bigserial not null,
+ url text not null
+);
+
+comment on column origin.id is 'Artifact origin id';
+comment on column origin.url is 'URL of origin';
+
+
+-- Content blobs observed somewhere, but not ingested into the archive for
+-- whatever reason. This table is separate from the content table as we might
+-- not have the sha1 checksum of skipped contents (for instance when we inject
+-- git repositories, objects that are too big will be skipped here, and we will
+-- only know their sha1_git). 'reason' contains the reason the content was
+-- skipped. 'origin' is a nullable column that makes it possible to find out
+-- which origin contains that skipped content.
+create table skipped_content
+(
+ sha1 sha1,
+ sha1_git sha1_git,
+ sha256 sha256,
+ blake2s256 blake2s256,
+ length bigint not null,
+ ctime timestamptz not null default now(),
+ status content_status not null default 'absent',
+ reason text not null,
+ origin bigint,
+ object_id bigserial
+);
+
+comment on table skipped_content is 'Content blobs observed, but not ingested in the archive';
+comment on column skipped_content.sha1 is 'Skipped content sha1 hash';
+comment on column skipped_content.sha1_git is 'Git object sha1 hash';
+comment on column skipped_content.sha256 is 'Skipped content sha256 hash';
+comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash';
+comment on column skipped_content.length is 'Skipped content length';
+comment on column skipped_content.ctime is 'First seen time';
+comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)';
+comment on column skipped_content.reason is 'Reason for skipping';
+comment on column skipped_content.origin is 'Origin table identifier';
+comment on column skipped_content.object_id is 'Skipped content identifier';
+
+
+-- A file-system directory. A directory is a list of directory entries (see
+-- tables: directory_entry_{dir,file}).
+--
+-- To list the contents of a directory:
+-- 1. list the contained directory_entry_dir using array dir_entries
+-- 2. list the contained directory_entry_file using array file_entries
+-- 3. list the contained directory_entry_rev using array rev_entries
+-- 4. UNION
+--
+-- Synonyms/mappings:
+-- * git: tree
+create table directory
+(
+ id sha1_git not null,
+ dir_entries bigint[], -- sub-directories, reference directory_entry_dir
+ file_entries bigint[], -- contained files, reference directory_entry_file
+ rev_entries bigint[], -- mounted revisions, reference directory_entry_rev
+ object_id bigserial -- short object identifier
+);
+
+comment on table directory is 'Contents of a directory, synonymous to tree (git)';
+comment on column directory.id is 'Git object sha1 hash';
+comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir';
+comment on column directory.file_entries is 'Contained files, reference directory_entry_file';
+comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev';
+comment on column directory.object_id is 'Short object identifier';
+
+
+-- A directory entry pointing to a (sub-)directory.
+create table directory_entry_dir
+(
+ id bigserial,
+ target sha1_git not null, -- id of target directory
+ name unix_path not null, -- path name, relative to containing dir
+ perms file_perms not null -- unix-like permissions
+);
+
+comment on table directory_entry_dir is 'Directory entry for directory';
+comment on column directory_entry_dir.id is 'Directory identifier';
+comment on column directory_entry_dir.target is 'Target directory identifier';
+comment on column directory_entry_dir.name is 'Path name, relative to containing directory';
+comment on column directory_entry_dir.perms is 'Unix-like permissions';
+
+
+-- A directory entry pointing to a file content.
+create table directory_entry_file
+(
+ id bigserial,
+ target sha1_git not null, -- id of target file
+ name unix_path not null, -- path name, relative to containing dir
+ perms file_perms not null -- unix-like permissions
+);
+
+comment on table directory_entry_file is 'Directory entry for file';
+comment on column directory_entry_file.id is 'File identifier';
+comment on column directory_entry_file.target is 'Target file identifier';
+comment on column directory_entry_file.name is 'Path name, relative to containing directory';
+comment on column directory_entry_file.perms is 'Unix-like permissions';
+
+
+-- A directory entry pointing to a revision.
+create table directory_entry_rev
+(
+ id bigserial,
+ target sha1_git not null, -- id of target revision
+ name unix_path not null, -- path name, relative to containing dir
+ perms file_perms not null -- unix-like permissions
+);
+
+comment on table directory_entry_rev is 'Directory entry for revision';
+comment on column directory_entry_rev.id is 'Revision entry identifier';
+comment on column directory_entry_rev.target is 'Target revision identifier';
+comment on column directory_entry_rev.name is 'Path name, relative to containing directory';
+comment on column directory_entry_rev.perms is 'Unix-like permissions';
+
+
+-- A person referenced by some source code artifacts, e.g., a VCS revision or
+-- release metadata.
+create table person
+(
+ id bigserial,
+ name bytea, -- advisory: not null if we managed to parse a name
+ email bytea, -- advisory: not null if we managed to parse an email
+ fullname bytea not null -- freeform specification; what is actually used in the checksums
+ -- will usually be of the form 'name <email>'
+);
+
+comment on table person is 'Person referenced in code artifact release metadata';
+comment on column person.id is 'Person identifier';
+comment on column person.name is 'Name';
+comment on column person.email is 'Email';
+comment on column person.fullname is 'Full name (raw name)';
+
+
+-- The state of a source code tree at a specific point in time.
+--
+-- Synonyms/mappings:
+-- * git / subversion / etc: commit
+-- * tarball: a specific tarball
+--
+-- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in
+-- case of merges) parent revisions. Each revision points to a directory, i.e.,
+-- a file-system tree containing files and directories.
+create table revision
+(
+ id sha1_git not null,
+ date timestamptz,
+ date_offset smallint,
+ committer_date timestamptz,
+ committer_date_offset smallint,
+ type revision_type not null,
+ directory sha1_git, -- source code 'root' directory
+ message bytea,
+ author bigint,
+ committer bigint,
+ synthetic boolean not null default false, -- true iff revision has been created by Software Heritage
+ metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...)
+ object_id bigserial,
+ date_neg_utc_offset boolean,
+ committer_date_neg_utc_offset boolean,
+ extra_headers bytea[][] not null -- extra headers (used in hash computation)
+);
+
+comment on table revision is 'A revision represents the state of a source code tree at a specific point in time';
+comment on column revision.id is 'Git-style SHA1 commit identifier';
+comment on column revision.date is 'Author timestamp as UNIX epoch';
+comment on column revision.date_offset is 'Author timestamp timezone, as minute offsets from UTC';
+comment on column revision.date_neg_utc_offset is 'True indicates a -0 UTC offset on author timestamp';
+comment on column revision.committer_date is 'Committer timestamp as UNIX epoch';
+comment on column revision.committer_date_offset is 'Committer timestamp timezone, as minute offsets from UTC';
+comment on column revision.committer_date_neg_utc_offset is 'True indicates a -0 UTC offset on committer timestamp';
+comment on column revision.type is 'Type of revision';
+comment on column revision.directory is 'Directory identifier';
+comment on column revision.message is 'Commit message';
+comment on column revision.author is 'Author identity';
+comment on column revision.committer is 'Committer identity';
+comment on column revision.synthetic is 'True iff revision has been synthesized by Software Heritage';
+comment on column revision.metadata is 'Extra revision metadata';
+comment on column revision.object_id is 'Non-intrinsic, sequential object identifier';
+comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation';
+
+
+-- Parent revisions of each revision (an alternative design would store these
+-- in a sha1_git[] column on the revision table).
+create table revision_history
+(
+ id sha1_git not null,
+ parent_id sha1_git not null,
+ parent_rank int not null default 0
+ -- parent position in merge commits, 0-based
+);
+
+comment on table revision_history is 'Sequence of revision history with parent and position in history';
+comment on column revision_history.id is 'Revision history git object sha1 checksum';
+comment on column revision_history.parent_id is 'Parent revision git object identifier';
+comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based';
+
+
+-- Crawling history of software origins visited by Software Heritage. Each
+-- visit is a 3-way mapping between a software origin, a timestamp, and a
+-- snapshot object capturing the full state of the origin at visit time.
+create table origin_visit
+(
+ origin bigint not null,
+ visit bigint not null,
+ date timestamptz not null,
+ type text not null
+);
+
+comment on column origin_visit.origin is 'Visited origin';
+comment on column origin_visit.visit is 'Sequential visit number for the origin';
+comment on column origin_visit.date is 'Visit timestamp';
+comment on column origin_visit.type is 'Type of loader that did the visit (hg, git, ...)';
+
+
+-- Crawling history of software origin visits by Software Heritage. Each
+-- visit sees its history change through new origin visit status updates.
+create table origin_visit_status
+(
+ origin bigint not null,
+ visit bigint not null,
+ date timestamptz not null,
+ status origin_visit_state not null,
+ metadata jsonb,
+ snapshot sha1_git
+);
+
+comment on column origin_visit_status.origin is 'Origin concerned by the visit update';
+comment on column origin_visit_status.visit is 'Visit concerned by the visit update';
+comment on column origin_visit_status.date is 'Visit update timestamp';
+comment on column origin_visit_status.status is 'Visit status (created, ongoing, partial, full)';
+comment on column origin_visit_status.metadata is 'Optional origin visit metadata';
+comment on column origin_visit_status.snapshot is 'Optional, possibly partial, snapshot of the origin visit';
+
+
+-- A snapshot represents the entire state of a software origin as crawled by
+-- Software Heritage. This table is a simple mapping between (public) intrinsic
+-- snapshot identifiers and (private) numeric sequential identifiers.
+create table snapshot
+(
+ object_id bigserial not null, -- PK internal object identifier
+ id sha1_git not null -- snapshot intrinsic identifier
+);
+
+comment on table snapshot is 'State of a software origin as crawled by Software Heritage';
+comment on column snapshot.object_id is 'Internal object identifier';
+comment on column snapshot.id is 'Intrinsic snapshot identifier';
+
+
+-- Each snapshot associates "branch" names to other objects in the Software
+-- Heritage Merkle DAG. This table describes branches as mappings between names
+-- and target typed objects.
+create table snapshot_branch
+(
+ object_id bigserial not null, -- PK internal object identifier
+ name bytea not null, -- branch name, e.g., "master" or "feature/drag-n-drop"
+ target bytea, -- target object identifier, e.g., a revision identifier
+ target_type snapshot_target -- target object type, e.g., "revision"
+);
+
+comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG';
+comment on column snapshot_branch.object_id is 'Internal object identifier';
+comment on column snapshot_branch.name is 'Branch name';
+comment on column snapshot_branch.target is 'Target object identifier';
+comment on column snapshot_branch.target_type is 'Target object type';
+
+
+-- Mapping between snapshots and their branches.
+create table snapshot_branches
+(
+ snapshot_id bigint not null, -- snapshot identifier, ref. snapshot.object_id
+ branch_id bigint not null -- branch identifier, ref. snapshot_branch.object_id
+);
+
+comment on table snapshot_branches is 'Mapping between snapshots and their branches';
+comment on column snapshot_branches.snapshot_id is 'Snapshot identifier';
+comment on column snapshot_branches.branch_id is 'Branch identifier';
+
+
+-- A "memorable" point in time in the development history of a software
+-- project.
+--
+-- Synonyms/mappings:
+-- * git: tag (of the annotated kind, otherwise they are just references)
+-- * tarball: the release version number
+create table release
+(
+ id sha1_git not null,
+ target sha1_git,
+ date timestamptz,
+ date_offset smallint,
+ name bytea,
+ comment bytea,
+ author bigint,
+ synthetic boolean not null default false, -- true iff release has been created by Software Heritage
+ object_id bigserial,
+ target_type object_type not null,
+ date_neg_utc_offset boolean
+);
+
+comment on table release is 'Details of a software release, synonymous with
+ a tag (git) or version number (tarball)';
+comment on column release.id is 'Release git identifier';
+comment on column release.target is 'Target git identifier';
+comment on column release.date is 'Release timestamp';
+comment on column release.date_offset is 'Timestamp offset from UTC';
+comment on column release.name is 'Name';
+comment on column release.comment is 'Comment';
+comment on column release.author is 'Author';
+comment on column release.synthetic is 'Indicates if created by Software Heritage';
+comment on column release.object_id is 'Object identifier';
+comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'',
+ ''release'', ''snapshot'')';
+comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp';
+
+-- Tools
+create table metadata_fetcher
+(
+ id serial not null,
+ name text not null,
+ version text not null,
+ metadata jsonb not null
+);
+
+comment on table metadata_fetcher is 'Tools used to retrieve metadata';
+comment on column metadata_fetcher.id is 'Internal identifier of the fetcher';
+comment on column metadata_fetcher.name is 'Fetcher name';
+comment on column metadata_fetcher.version is 'Fetcher version';
+comment on column metadata_fetcher.metadata is 'Extra information about the fetcher';
+
+
+create table metadata_authority
+(
+ id serial not null,
+ type text not null,
+ url text not null,
+ metadata jsonb not null
+);
+
+comment on table metadata_authority is 'Metadata authority information';
+comment on column metadata_authority.id is 'Internal identifier of the authority';
+comment on column metadata_authority.type is 'Type of authority (deposit_client/forge/registry)';
+comment on column metadata_authority.url is 'Authority''s URL';
+comment on column metadata_authority.metadata is 'Other metadata about authority';
+
+
+-- Extrinsic metadata on DAG objects and origins.
+create table raw_extrinsic_metadata
+(
+ type text not null,
+ target text not null,
+
+ -- metadata source
+ authority_id bigint not null,
+ fetcher_id bigint not null,
+ discovery_date timestamptz not null,
+
+ -- metadata itself
+ format text not null,
+ metadata bytea not null,
+
+ -- context
+ origin text,
+ visit bigint,
+ snapshot swhid,
+ release swhid,
+ revision swhid,
+ path bytea,
+ directory swhid
+);
+
+comment on table raw_extrinsic_metadata is 'keeps all metadata found concerning an object';
+comment on column raw_extrinsic_metadata.type is 'the type of object (content/directory/revision/release/snapshot/origin) the metadata is on';
+comment on column raw_extrinsic_metadata.target is 'the SWHID or origin URL for which the metadata was found';
+comment on column raw_extrinsic_metadata.discovery_date is 'the date of retrieval';
+comment on column raw_extrinsic_metadata.authority_id is 'the metadata provider: github, openhub, deposit, etc.';
+comment on column raw_extrinsic_metadata.fetcher_id is 'the tool used for extracting metadata: loaders, crawlers, etc.';
+comment on column raw_extrinsic_metadata.format is 'name of the format of metadata, used by readers to interpret it.';
+comment on column raw_extrinsic_metadata.metadata is 'original metadata in opaque format';
+
+
+-- Keep a cache of object counts
+create table object_counts
+(
+ object_type text, -- table for which we're counting objects (PK)
+ value bigint, -- count of objects in the table
+ last_update timestamptz, -- last update for the object count in this table
+ single_update boolean -- whether we update this table standalone (true) or through bucketed counts (false)
+);
+
+comment on table object_counts is 'Cache of object counts';
+comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ''revision'',
+ ''release'', ''snapshot'')';
+comment on column object_counts.value is 'Count of objects in the table';
+comment on column object_counts.last_update is 'Last update for object count';
+comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)';
+
+
+create table object_counts_bucketed
+(
+ line serial not null, -- PK
+ object_type text not null, -- table for which we're counting objects
+ identifier text not null, -- identifier across which we're bucketing objects
+ bucket_start bytea, -- lower bound (inclusive) for the bucket
+ bucket_end bytea, -- upper bound (exclusive) for the bucket
+ value bigint, -- count of objects in the bucket
+ last_update timestamptz -- last update for the object count in this bucket
+);
+
+comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type';
+comment on column object_counts_bucketed.line is 'Auto-incremented identifier value';
+comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ''revision'',
+ ''release'', ''snapshot'')';
+comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects';
+comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket';
+comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket';
+comment on column object_counts_bucketed.value is 'Count of objects in the bucket';
+comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket';
diff --git a/swh/storage/tests/data/sql-v0.18.0/40-funcs.sql b/swh/storage/tests/data/sql-v0.18.0/40-funcs.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/40-funcs.sql
@@ -0,0 +1,960 @@
+create or replace function hash_sha1(text)
+ returns text
+as $$
+ select encode(digest($1, 'sha1'), 'hex')
+$$ language sql strict immutable;
+
+comment on function hash_sha1(text) is 'Compute SHA1 hash as text';
+
+-- create a temporary table called tmp_TBLNAME, mimicking existing table
+-- TBLNAME
+--
+-- Args:
+-- tblname: name of the table to mimic
+create or replace function swh_mktemp(tblname regclass)
+ returns void
+ language plpgsql
+as $$
+begin
+ execute format('
+ create temporary table if not exists tmp_%1$I
+ (like %1$I including defaults)
+ on commit delete rows;
+ alter table tmp_%1$I drop column if exists object_id;
+ ', tblname);
+ return;
+end
+$$;
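+
+-- Example (sketch, not executed here): mirror the content table into a
+-- session-local temporary table, ready to receive a bulk COPY:
+--
+--   select swh_mktemp('content');  -- creates tmp_content, minus object_id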
+
+-- create a temporary table for directory entries called tmp_TBLNAME,
+-- mimicking existing table TBLNAME with an extra dir_id (sha1_git)
+-- column, and dropping the id column.
+--
+-- This is used to create the tmp_directory_entry_<foo> tables.
+--
+-- Args:
+-- tblname: name of the table to mimic
+create or replace function swh_mktemp_dir_entry(tblname regclass)
+ returns void
+ language plpgsql
+as $$
+begin
+ execute format('
+ create temporary table if not exists tmp_%1$I
+ (like %1$I including defaults, dir_id sha1_git)
+ on commit delete rows;
+ alter table tmp_%1$I drop column if exists id;
+ ', tblname);
+ return;
+end
+$$;
+
+-- create a temporary table for revisions called tmp_revisions,
+-- mimicking existing table revision, replacing the foreign keys to
+-- people with an email and name field
+--
+create or replace function swh_mktemp_revision()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_revision (
+ like revision including defaults,
+ author_fullname bytea,
+ author_name bytea,
+ author_email bytea,
+ committer_fullname bytea,
+ committer_name bytea,
+ committer_email bytea
+ ) on commit delete rows;
+ alter table tmp_revision drop column if exists author;
+ alter table tmp_revision drop column if exists committer;
+ alter table tmp_revision drop column if exists object_id;
+$$;
+
+-- create a temporary table for releases called tmp_release,
+-- mimicking existing table release, replacing the foreign keys to
+-- people with an email and name field
+--
+create or replace function swh_mktemp_release()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_release (
+ like release including defaults,
+ author_fullname bytea,
+ author_name bytea,
+ author_email bytea
+ ) on commit delete rows;
+ alter table tmp_release drop column if exists author;
+ alter table tmp_release drop column if exists object_id;
+$$;
+
+-- create a temporary table for the branches of a snapshot
+create or replace function swh_mktemp_snapshot_branch()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_snapshot_branch (
+ name bytea not null,
+ target bytea,
+ target_type snapshot_target
+ ) on commit delete rows;
+$$;
+
+-- a content signature is a set of cryptographic checksums that we use to
+-- uniquely identify content, for the purpose of verifying if we already have
+-- some content or not during content injection
+create type content_signature as (
+ sha1 sha1,
+ sha1_git sha1_git,
+ sha256 sha256,
+ blake2s256 blake2s256
+);
+
+
+-- check which entries of tmp_skipped_content are missing from skipped_content
+--
+-- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
+-- 2. call this function
+create or replace function swh_skipped_content_missing()
+ returns setof content_signature
+ language plpgsql
+as $$
+begin
+ return query
+ select sha1, sha1_git, sha256, blake2s256 from tmp_skipped_content t
+ where not exists
+ (select 1 from skipped_content s where
+ s.sha1 is not distinct from t.sha1 and
+ s.sha1_git is not distinct from t.sha1_git and
+ s.sha256 is not distinct from t.sha256);
+ return;
+end
+$$;
+
+
+-- add tmp_content entries to content, skipping duplicates
+--
+-- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
+-- 2. call this function
+create or replace function swh_content_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ insert into content (sha1, sha1_git, sha256, blake2s256, length, status, ctime)
+ select distinct sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content;
+ return;
+end
+$$;
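+
+-- Illustrative sketch of the bulk workflow described above (commented out;
+-- the data file path is hypothetical):
+--
+--   select swh_mktemp('content');
+--   copy tmp_content (sha1, sha1_git, sha256, blake2s256, length, status, ctime)
+--     from '/tmp/contents.tsv';
+--   select swh_content_add();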
+
+
+-- add tmp_skipped_content entries to skipped_content, skipping duplicates
+--
+-- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
+-- 2. call this function
+create or replace function swh_skipped_content_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ insert into skipped_content (sha1, sha1_git, sha256, blake2s256, length, status, reason, origin)
+ select distinct sha1, sha1_git, sha256, blake2s256, length, status, reason, origin
+ from tmp_skipped_content
+ where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in (
+ select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')
+ from swh_skipped_content_missing()
+ );
+ -- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
+ -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
+ -- the extra swh_skipped_content_missing() query here.
+ return;
+end
+$$;
+
+-- Update content entries from temporary table.
+-- (columns_update lists the columns to refresh, typically new columns added
+-- to the schema; it cannot be empty)
+--
+create or replace function swh_content_update(columns_update text[])
+ returns void
+ language plpgsql
+as $$
+declare
+ query text;
+ tmp_array text[];
+begin
+ if array_length(columns_update, 1) = 0 then
+ raise exception 'Please, provide the list of column names to update.';
+ end if;
+
+ tmp_array := array(select format('%1$s=t.%1$s', unnest) from unnest(columns_update));
+
+ query = format('update content set %s
+ from tmp_content t where t.sha1 = content.sha1',
+ array_to_string(tmp_array, ', '));
+
+ execute query;
+
+ return;
+end
+$$;
+
+comment on function swh_content_update(text[]) IS 'Update existing content''s columns';
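+
+-- Example (sketch): after COPYing the refreshed rows into tmp_content, update
+-- a single column of the matching content rows:
+--
+--   select swh_content_update(array['status']);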
+
+
+create type directory_entry_type as enum('file', 'dir', 'rev');
+
+
+-- Add tmp_directory_entry_* entries to directory_entry_* and directory,
+-- skipping duplicates in directory_entry_*. This is a generic function that
+-- works on all kinds of directory entries.
+--
+-- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_*'), 1 COPY to
+-- tmp_directory_entry_*, 2. call this function
+--
+-- Assumption: this function is used in the same transaction that inserts the
+-- context directory in table "directory".
+create or replace function swh_directory_entry_add(typ directory_entry_type)
+ returns void
+ language plpgsql
+as $$
+begin
+ execute format('
+ insert into directory_entry_%1$s (target, name, perms)
+ select distinct t.target, t.name, t.perms
+ from tmp_directory_entry_%1$s t
+ where not exists (
+ select 1
+ from directory_entry_%1$s i
+ where t.target = i.target and t.name = i.name and t.perms = i.perms)
+ ', typ);
+
+ execute format('
+ with new_entries as (
+ select t.dir_id, array_agg(i.id) as entries
+ from tmp_directory_entry_%1$s t
+ inner join directory_entry_%1$s i
+ using (target, name, perms)
+ group by t.dir_id
+ )
+ update tmp_directory as d
+ set %1$s_entries = new_entries.entries
+ from new_entries
+ where d.id = new_entries.dir_id
+ ', typ);
+
+ return;
+end
+$$;
+
+-- Insert the data from tmp_directory, tmp_directory_entry_file,
+-- tmp_directory_entry_dir, tmp_directory_entry_rev into their final
+-- tables.
+--
+-- Prerequisites:
+-- directory ids in tmp_directory
+-- entries in tmp_directory_entry_{file,dir,rev}
+--
+create or replace function swh_directory_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_directory_entry_add('file');
+ perform swh_directory_entry_add('dir');
+ perform swh_directory_entry_add('rev');
+
+ insert into directory
+ select * from tmp_directory t
+ where not exists (
+ select 1 from directory d
+ where d.id = t.id);
+
+ return;
+end
+$$;
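+
+-- Illustrative sketch of the full directory insertion workflow (commented
+-- out): the loader fills the temporary tables, then flushes them in one call:
+--
+--   select swh_mktemp('directory');
+--   select swh_mktemp_dir_entry('directory_entry_dir');
+--   select swh_mktemp_dir_entry('directory_entry_file');
+--   select swh_mktemp_dir_entry('directory_entry_rev');
+--   -- COPY rows into tmp_directory and tmp_directory_entry_{dir,file,rev} ...
+--   select swh_directory_add();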
+
+-- a directory listing entry with all the metadata
+--
+-- can be used to list a directory, and retrieve all the data in one go.
+create type directory_entry as
+(
+ dir_id sha1_git, -- id of the parent directory
+ type directory_entry_type, -- type of entry
+ target sha1_git, -- id of target
+ name unix_path, -- path name, relative to containing dir
+ perms file_perms, -- unix-like permissions
+ status content_status, -- visible or absent
+ sha1 sha1, -- content sha1 if type is not dir
+ sha1_git sha1_git, -- content's sha1 git if type is not dir
+ sha256 sha256, -- content's sha256 if type is not dir
+ length bigint -- content length if type is not dir
+);
+
+
+-- List a single level of directory walked_dir_id
+-- FIXME: order by name is not correct. For git, we need to order by
+-- lexicographic order but as if a trailing / is present in directory
+-- name
+create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
+ returns setof directory_entry
+ language sql
+ stable
+as $$
+ with dir as (
+ select id as dir_id, dir_entries, file_entries, rev_entries
+ from directory
+ where id = walked_dir_id),
+ ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
+ ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
+ ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
+ (select dir_id, 'dir'::directory_entry_type as type,
+ e.target, e.name, e.perms, NULL::content_status,
+ NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
+ from ls_d
+ left join directory_entry_dir e on ls_d.entry_id = e.id)
+ union
+ (with known_contents as
+ (select dir_id, 'file'::directory_entry_type as type,
+ e.target, e.name, e.perms, c.status,
+ c.sha1, c.sha1_git, c.sha256, c.length
+ from ls_f
+ left join directory_entry_file e on ls_f.entry_id = e.id
+ inner join content c on e.target = c.sha1_git)
+ select * from known_contents
+ union
+ (select dir_id, 'file'::directory_entry_type as type,
+ e.target, e.name, e.perms, c.status,
+ c.sha1, c.sha1_git, c.sha256, c.length
+ from ls_f
+ left join directory_entry_file e on ls_f.entry_id = e.id
+ left join skipped_content c on e.target = c.sha1_git
+ where not exists (select 1 from known_contents where known_contents.sha1_git=e.target)))
+ union
+ (select dir_id, 'rev'::directory_entry_type as type,
+ e.target, e.name, e.perms, NULL::content_status,
+ NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
+ from ls_r
+ left join directory_entry_rev e on ls_r.entry_id = e.id)
+ order by name;
+$$;
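+
+-- Example (sketch; the all-zero id is a placeholder for a real directory id):
+--
+--   select type, name, target
+--   from swh_directory_walk_one('\x0000000000000000000000000000000000000000'::sha1_git);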
+
+-- Recursively list the directory tree rooted at walked_dir_id
+create or replace function swh_directory_walk(walked_dir_id sha1_git)
+ returns setof directory_entry
+ language sql
+ stable
+as $$
+ with recursive entries as (
+ select dir_id, type, target, name, perms, status, sha1, sha1_git,
+ sha256, length
+ from swh_directory_walk_one(walked_dir_id)
+ union all
+ select dir_id, type, target, (dirname || '/' || name)::unix_path as name,
+ perms, status, sha1, sha1_git, sha256, length
+ from (select (swh_directory_walk_one(dirs.target)).*, dirs.name as dirname
+ from (select target, name from entries where type = 'dir') as dirs) as with_parent
+ )
+ select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256, length
+ from entries
+$$;
+
+-- Find a directory entry by its path
+create or replace function swh_find_directory_entry_by_path(
+ walked_dir_id sha1_git,
+ dir_or_content_path bytea[])
+ returns directory_entry
+ language plpgsql
+as $$
+declare
+ end_index integer;
+ paths bytea default '';
+ path bytea;
+ res bytea[];
+ r record;
+begin
+ end_index := array_upper(dir_or_content_path, 1);
+ res[1] := walked_dir_id;
+
+ for i in 1..end_index
+ loop
+ path := dir_or_content_path[i];
+ -- concatenate path for patching the name in the result record (if we found it)
+ if i = 1 then
+ paths = path;
+ else
+ paths := paths || '/' || path; -- concatenate paths
+ end if;
+
+ if i <> end_index then
+ select *
+ from swh_directory_walk_one(res[i] :: sha1_git)
+ where name=path
+ and type = 'dir'
+ limit 1 into r;
+ else
+ select *
+ from swh_directory_walk_one(res[i] :: sha1_git)
+ where name=path
+ limit 1 into r;
+ end if;
+
+ -- find the path
+ if r is null then
+ return null;
+ else
+ -- store the next dir to lookup the next local path from
+ res[i+1] := r.target;
+ end if;
+ end loop;
+
+ -- at this moment, r is the result. Patch its 'name' with the full path before returning it.
+ r.name := paths;
+ return r;
+end
+$$;
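+
+-- Example (sketch, placeholder root id): locate 'src/main.c' below a root
+-- directory; the returned entry's name is the full relative path:
+--
+--   select *
+--   from swh_find_directory_entry_by_path(
+--     '\x0000000000000000000000000000000000000000'::sha1_git,
+--     array['src'::bytea, 'main.c'::bytea]);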
+
+-- List all revision IDs starting from a given revision, going back in time
+--
+-- TODO ordering: should be breadth-first right now (what do we want?)
+-- TODO ordering: ORDER BY parent_rank somewhere?
+create or replace function swh_revision_list(root_revisions bytea[], num_revs bigint default NULL)
+ returns table (id sha1_git, parents bytea[])
+ language sql
+ stable
+as $$
+ with recursive full_rev_list(id) as (
+ (select id from revision where id = ANY(root_revisions))
+ union
+ (select h.parent_id
+ from revision_history as h
+ join full_rev_list on h.id = full_rev_list.id)
+ ),
+ rev_list as (select id from full_rev_list limit num_revs)
+ select rev_list.id as id,
+ array(select rh.parent_id::bytea
+ from revision_history rh
+ where rh.id = rev_list.id
+ order by rh.parent_rank
+ ) as parent
+ from rev_list;
+$$;
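+
+-- Example (sketch, placeholder id): list at most 100 revisions reachable from
+-- a given head, together with their ordered parent ids:
+--
+--   select *
+--   from swh_revision_list(
+--     array['\x0000000000000000000000000000000000000000'::bytea], 100);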
+
+
+-- Detailed entry for a revision
+create type revision_entry as
+(
+ id sha1_git,
+ date timestamptz,
+ date_offset smallint,
+ date_neg_utc_offset boolean,
+ committer_date timestamptz,
+ committer_date_offset smallint,
+ committer_date_neg_utc_offset boolean,
+ type revision_type,
+ directory sha1_git,
+ message bytea,
+ author_id bigint,
+ author_fullname bytea,
+ author_name bytea,
+ author_email bytea,
+ committer_id bigint,
+ committer_fullname bytea,
+ committer_name bytea,
+ committer_email bytea,
+ metadata jsonb,
+ synthetic boolean,
+ parents bytea[],
+ object_id bigint,
+ extra_headers bytea[][]
+);
+
+
+-- "git style" revision log. Similar to swh_revision_list(), but returning all
+-- information associated to each revision, and expanding authors/committers
+create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
+ returns setof revision_entry
+ language sql
+ stable
+as $$
+ select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ r.type, r.directory, r.message,
+ a.id, a.fullname, a.name, a.email,
+ c.id, c.fullname, c.name, c.email,
+ r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers
+ from swh_revision_list(root_revisions, num_revs) as t
+ left join revision r on t.id = r.id
+ left join person a on a.id = r.author
+ left join person c on c.id = r.committer;
+$$;
+
+
+-- Detailed entry for a release
+create type release_entry as
+(
+ id sha1_git,
+ target sha1_git,
+ target_type object_type,
+ date timestamptz,
+ date_offset smallint,
+ date_neg_utc_offset boolean,
+ name bytea,
+ comment bytea,
+ synthetic boolean,
+ author_id bigint,
+ author_fullname bytea,
+ author_name bytea,
+ author_email bytea,
+ object_id bigint
+);
+
+-- Create entries in person from tmp_revision
+create or replace function swh_person_add_from_revision()
+ returns void
+ language plpgsql
+as $$
+begin
+ with t as (
+ select author_fullname as fullname, author_name as name, author_email as email from tmp_revision
+ union
+ select committer_fullname as fullname, committer_name as name, committer_email as email from tmp_revision
+ ) insert into person (fullname, name, email)
+ select distinct on (fullname) fullname, name, email from t
+ where not exists (
+ select 1
+ from person p
+ where t.fullname = p.fullname
+ );
+ return;
+end
+$$;
+
+
+-- Create entries in revision from tmp_revision
+create or replace function swh_revision_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_person_add_from_revision();
+
+ insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
+ select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
+ from tmp_revision t
+ left join person a on a.fullname = t.author_fullname
+ left join person c on c.fullname = t.committer_fullname;
+ return;
+end
+$$;
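+
+-- Illustrative sketch of the revision bulk workflow (commented out):
+--
+--   select swh_mktemp_revision();
+--   -- COPY rows (with author_*/committer_* columns) into tmp_revision ...
+--   select swh_revision_add();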
+
+
+-- Create entries in person from tmp_release
+create or replace function swh_person_add_from_release()
+ returns void
+ language plpgsql
+as $$
+begin
+ with t as (
+ select distinct author_fullname as fullname, author_name as name, author_email as email from tmp_release
+ where author_fullname is not null
+ ) insert into person (fullname, name, email)
+ select distinct on (fullname) fullname, name, email from t
+ where not exists (
+ select 1
+ from person p
+ where t.fullname = p.fullname
+ );
+ return;
+end
+$$;
+
+
+-- Create entries in release from tmp_release
+create or replace function swh_release_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_person_add_from_release();
+
+ insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
+ select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
+ from tmp_release t
+ left join person a on a.fullname = t.author_fullname
+ where not exists (select 1 from release where t.id = release.id);
+ return;
+end
+$$;
+
+
+-- add a new origin_visit for the origin matching origin_url, at the given date.
+--
+-- Returns the new visit id.
+create or replace function swh_origin_visit_add(origin_url text, date timestamptz, type text)
+ returns bigint
+ language sql
+as $$
+ with origin_id as (
+ select id
+ from origin
+ where url = origin_url
+ ), last_known_visit as (
+ select coalesce(max(visit), 0) as visit
+ from origin_visit
+ where origin = (select id from origin_id)
+ )
+ insert into origin_visit (origin, date, type, visit)
+ values ((select id from origin_id), date, type,
+ (select visit from last_known_visit) + 1)
+ returning visit;
+$$;
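+
+-- Example (sketch, placeholder URL): record a new git visit of a known origin;
+-- the returned value is the per-origin sequential visit number:
+--
+--   select swh_origin_visit_add('https://example.org/repo.git', now(), 'git');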
+
+create or replace function swh_snapshot_add(snapshot_id sha1_git)
+ returns void
+ language plpgsql
+as $$
+declare
+ snapshot_object_id snapshot.object_id%type;
+begin
+ select object_id from snapshot where id = snapshot_id into snapshot_object_id;
+ if snapshot_object_id is null then
+ insert into snapshot (id) values (snapshot_id) returning object_id into snapshot_object_id;
+ insert into snapshot_branch (name, target_type, target)
+ select name, target_type, target from tmp_snapshot_branch tmp
+ where not exists (
+ select 1
+ from snapshot_branch sb
+ where sb.name = tmp.name
+ and sb.target = tmp.target
+ and sb.target_type = tmp.target_type
+ )
+ on conflict do nothing;
+ insert into snapshot_branches (snapshot_id, branch_id)
+ select snapshot_object_id, sb.object_id as branch_id
+ from tmp_snapshot_branch tmp
+ join snapshot_branch sb
+ using (name, target, target_type)
+ where tmp.target is not null and tmp.target_type is not null
+ union
+ select snapshot_object_id, sb.object_id as branch_id
+ from tmp_snapshot_branch tmp
+ join snapshot_branch sb
+ using (name)
+ where tmp.target is null and tmp.target_type is null
+ and sb.target is null and sb.target_type is null;
+ end if;
+ truncate table tmp_snapshot_branch;
+end;
+$$;
+
+create type snapshot_result as (
+ snapshot_id sha1_git,
+ name bytea,
+ target bytea,
+ target_type snapshot_target
+);
+
+create or replace function swh_snapshot_get_by_id(id sha1_git,
+ branches_from bytea default '', branches_count bigint default null,
+ target_types snapshot_target[] default NULL)
+ returns setof snapshot_result
+ language sql
+ stable
+as $$
+ -- with small limits, the "naive" version of this query can degenerate into
+ -- using the deduplication index on snapshot_branch (name, target,
+ -- target_type); the planner happily scans several hundred million rows.
+
+ -- Do the query in two steps: first pull the relevant branches for the given
+ -- snapshot (filtering them by type), then do the limiting. This two-step
+ -- process guides the planner into using the proper index.
+ with filtered_snapshot_branches as (
+ select swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
+ from snapshot_branches
+ inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
+ where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
+ and (target_types is null or target_type = any(target_types))
+ order by name
+ )
+ select snapshot_id, name, target, target_type
+ from filtered_snapshot_branches
+ where name >= branches_from
+ order by name limit branches_count;
+$$;
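+
+-- Example (sketch, placeholder snapshot id): page through a snapshot's
+-- branches, 1000 at a time, keeping only those targeting revisions:
+--
+--   select *
+--   from swh_snapshot_get_by_id(
+--     '\x0000000000000000000000000000000000000000'::sha1_git,
+--     branches_from := '', branches_count := 1000,
+--     target_types := array['revision'::snapshot_target]);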
+
+create type snapshot_size as (
+ target_type snapshot_target,
+ count bigint
+);
+
+create or replace function swh_snapshot_count_branches(id sha1_git)
+ returns setof snapshot_size
+ language sql
+ stable
+as $$
+ SELECT target_type, count(name)
+ from swh_snapshot_get_by_id(swh_snapshot_count_branches.id)
+ group by target_type;
+$$;
+
+-- Absolute path: directory reference + complete path relative to it
+create type content_dir as (
+ directory sha1_git,
+ path unix_path
+);
+
+
+-- Find the containing directory of a given content, specified by sha1
+-- (note: *not* sha1_git).
+--
+-- Return a pair (dir_id, path) where path is a UNIX path that, from the
+-- directory root, reaches down to a file with the desired content. Return NULL
+-- if no match is found.
+--
+-- In case of multiple paths (i.e., pretty much always), an arbitrary one is
+-- chosen.
+create or replace function swh_content_find_directory(content_id sha1)
+ returns content_dir
+ language sql
+ stable
+as $$
+ with recursive path as (
+ -- Recursively build a path from the requested content to a root
+ -- directory. Each iteration returns a pair (dir_id, filename) where
+ -- filename is relative to dir_id. Stops when no parent directory can
+ -- be found.
+ (select dir.id as dir_id, dir_entry_f.name as name, 0 as depth
+ from directory_entry_file as dir_entry_f
+ join content on content.sha1_git = dir_entry_f.target
+ join directory as dir on dir.file_entries @> array[dir_entry_f.id]
+ where content.sha1 = content_id
+ limit 1)
+ union all
+ (select dir.id as dir_id,
+ (dir_entry_d.name || '/' || path.name)::unix_path as name,
+ path.depth + 1
+ from path
+ join directory_entry_dir as dir_entry_d on dir_entry_d.target = path.dir_id
+ join directory as dir on dir.dir_entries @> array[dir_entry_d.id]
+ limit 1)
+ )
+ select dir_id, name from path order by depth desc limit 1;
+$$;
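+
+-- Example (sketch, placeholder sha1): find one directory containing the
+-- content with the given sha1, along with the path below that directory:
+--
+--   select *
+--   from swh_content_find_directory('\x0000000000000000000000000000000000000000'::sha1);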
+
+-- Find the visit of the given origin closest to visit_date
+-- Breaks ties by selecting the largest visit id
+create or replace function swh_visit_find_by_date(origin_url text, visit_date timestamptz default NOW())
+ returns setof origin_visit
+ language plpgsql
+ stable
+as $$
+declare
+ origin_id bigint;
+begin
+ select id into origin_id from origin where url=origin_url;
+ return query
+ with closest_two_visits as ((
+ select ov, (date - visit_date), visit as interval
+ from origin_visit ov
+ where ov.origin = origin_id
+ and ov.date >= visit_date
+ order by ov.date asc, ov.visit desc
+ limit 1
+ ) union (
+ select ov, (visit_date - date), visit as interval
+ from origin_visit ov
+ where ov.origin = origin_id
+ and ov.date < visit_date
+ order by ov.date desc, ov.visit desc
+ limit 1
+ )) select (ov).* from closest_two_visits order by interval, visit limit 1;
+end
+$$;
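+
+-- Example (sketch, placeholder URL): find the visit of an origin closest to
+-- the start of 2021:
+--
+--   select *
+--   from swh_visit_find_by_date('https://example.org/repo.git',
+--                               '2021-01-01'::timestamptz);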
+
+-- Object listing by object_id
+
+create or replace function swh_content_list_by_object_id(
+ min_excl bigint,
+ max_incl bigint
+)
+ returns setof content
+ language sql
+ stable
+as $$
+ select * from content
+ where object_id > min_excl and object_id <= max_incl
+ order by object_id;
+$$;
+
+create or replace function swh_revision_list_by_object_id(
+ min_excl bigint,
+ max_incl bigint
+)
+ returns setof revision_entry
+ language sql
+ stable
+as $$
+ with revs as (
+ select * from revision
+ where object_id > min_excl and object_id <= max_incl
+ )
+ select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ r.type, r.directory, r.message,
+ a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
+ array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
+ as parents, r.object_id, r.extra_headers
+ from revs r
+ left join person a on a.id = r.author
+ left join person c on c.id = r.committer
+ order by r.object_id;
+$$;
+
+create or replace function swh_release_list_by_object_id(
+ min_excl bigint,
+ max_incl bigint
+)
+ returns setof release_entry
+ language sql
+ stable
+as $$
+ with rels as (
+ select * from release
+ where object_id > min_excl and object_id <= max_incl
+ )
+ select r.id, r.target, r.target_type, r.date, r.date_offset, r.date_neg_utc_offset, r.name, r.comment,
+ r.synthetic, p.id as author_id, p.fullname as author_fullname, p.name as author_name, p.email as author_email, r.object_id
+ from rels r
+ left join person p on p.id = r.author
+ order by r.object_id;
+$$;
+
+
+-- simple counter mapping a textual label to an integer value
+create type counter as (
+ label text,
+ value bigint
+);
+
+-- return statistics about the number of tuples in various SWH tables
+--
+-- Note: the returned values are based on postgres internal statistics
+-- (pg_class table), which are only updated daily (by autovacuum) or so
+create or replace function swh_stat_counters()
+ returns setof counter
+ language sql
+ stable
+as $$
+ select object_type as label, value as value
+ from object_counts
+ where object_type in (
+ 'content',
+ 'directory',
+ 'directory_entry_dir',
+ 'directory_entry_file',
+ 'directory_entry_rev',
+ 'origin',
+ 'origin_visit',
+ 'person',
+ 'release',
+ 'revision',
+ 'revision_history',
+ 'skipped_content',
+ 'snapshot'
+ );
+$$;
+
+create or replace function swh_update_counter(object_type text)
+ returns void
+ language plpgsql
+as $$
+begin
+ execute format('
+ insert into object_counts
+ (value, last_update, object_type)
+ values
+ ((select count(*) from %1$I), NOW(), %1$L)
+ on conflict (object_type) do update set
+ value = excluded.value,
+ last_update = excluded.last_update',
+ object_type);
+ return;
+end;
+$$;
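+
+-- Example (sketch): refresh the cached row count of the origin table:
+--
+--   select swh_update_counter('origin');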
+
+create or replace function swh_update_counter_bucketed()
+ returns void
+ language plpgsql
+as $$
+declare
+ query text;
+ line_to_update int;
+ new_value bigint;
+begin
+ select
+ object_counts_bucketed.line,
+ format(
+ 'select count(%I) from %I where %s',
+ coalesce(identifier, '*'),
+ object_type,
+ coalesce(
+ concat_ws(
+ ' and ',
+ case when bucket_start is not null then
+ format('%I >= %L', identifier, bucket_start) -- lower bound condition, inclusive
+ end,
+ case when bucket_end is not null then
+ format('%I < %L', identifier, bucket_end) -- upper bound condition, exclusive
+ end
+ ),
+ 'true'
+ )
+ )
+ from object_counts_bucketed
+ order by coalesce(last_update, now() - '1 month'::interval) asc
+ limit 1
+ into line_to_update, query;
+
+ execute query into new_value;
+
+ update object_counts_bucketed
+ set value = new_value,
+ last_update = now()
+ where object_counts_bucketed.line = line_to_update;
+
+END
+$$;
+
+create or replace function swh_update_counters_from_buckets()
+ returns trigger
+ language plpgsql
+as $$
+begin
+with to_update as (
+ select object_type, sum(value) as value, max(last_update) as last_update
+ from object_counts_bucketed ob1
+ where not exists (
+ select 1 from object_counts_bucketed ob2
+ where ob1.object_type = ob2.object_type
+ and value is null
+ )
+ group by object_type
+) update object_counts
+ set
+ value = to_update.value,
+ last_update = to_update.last_update
+ from to_update
+ where
+ object_counts.object_type = to_update.object_type
+ and object_counts.value != to_update.value;
+return null;
+end
+$$;
+
+create trigger update_counts_from_bucketed
+ after insert or update
+ on object_counts_bucketed
+ for each row
+ when (NEW.line % 256 = 0)
+ execute procedure swh_update_counters_from_buckets();
diff --git a/swh/storage/tests/data/sql-v0.18.0/60-indexes.sql b/swh/storage/tests/data/sql-v0.18.0/60-indexes.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/60-indexes.sql
@@ -0,0 +1,283 @@
+-- psql variables to get the current database flavor
+
+select swh_get_dbflavor() = 'read_replica' as dbflavor_read_replica \gset
+select swh_get_dbflavor() != 'read_replica' as dbflavor_does_deduplication \gset
+select swh_get_dbflavor() = 'mirror' as dbflavor_mirror \gset
+select swh_get_dbflavor() = 'default' as dbflavor_default \gset
+
+-- content
+
+create unique index concurrently content_pkey on content(sha1);
+alter table content add primary key using index content_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on content(sha1_git);
+\else
+ create index concurrently on content(sha1_git);
+\endif
+
+create index concurrently on content(sha256);
+create index concurrently on content(blake2s256);
+
+\if :dbflavor_default
+ create unique index concurrently on content(object_id); -- to be reviewed
+ create index concurrently on content(ctime); -- to be reviewed
+\endif
+
+-- origin
+
+create unique index concurrently origin_pkey on origin(id);
+alter table origin add primary key using index origin_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on origin using btree(url);
+\else
+ create index concurrently on origin using btree(url);
+\endif
+
+create index concurrently on origin using gin (url gin_trgm_ops);
+create index concurrently on origin using btree(digest(url, 'sha1'));
+
+
+-- skipped_content
+
+\if :dbflavor_does_deduplication
+ alter table skipped_content add constraint skipped_content_sha1_sha1_git_sha256_key unique (sha1, sha1_git, sha256);
+\endif
+
+create index concurrently on skipped_content(sha1);
+create index concurrently on skipped_content(sha1_git);
+create index concurrently on skipped_content(sha256);
+create index concurrently on skipped_content(blake2s256);
+create unique index concurrently on skipped_content(object_id);
+
+\if :dbflavor_default
+ alter table skipped_content add constraint skipped_content_origin_fkey foreign key (origin) references origin(id) not valid;
+ alter table skipped_content validate constraint skipped_content_origin_fkey;
+\endif
+
+-- directory
+create unique index concurrently directory_pkey on directory(id);
+alter table directory add primary key using index directory_pkey;
+
+\if :dbflavor_default
+ create index concurrently on directory using gin (dir_entries); -- to be reviewed
+ create index concurrently on directory using gin (file_entries); -- to be reviewed
+ create index concurrently on directory using gin (rev_entries); -- to be reviewed
+ create unique index concurrently on directory(object_id); -- to be reviewed
+\endif
+
+-- directory_entry_dir
+
+create unique index concurrently directory_entry_dir_pkey on directory_entry_dir(id);
+alter table directory_entry_dir add primary key using index directory_entry_dir_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on directory_entry_dir(target, name, perms);
+\endif
+
+-- directory_entry_file
+
+create unique index concurrently directory_entry_file_pkey on directory_entry_file(id);
+alter table directory_entry_file add primary key using index directory_entry_file_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on directory_entry_file(target, name, perms);
+\endif
+
+-- directory_entry_rev
+
+create unique index concurrently directory_entry_rev_pkey on directory_entry_rev(id);
+alter table directory_entry_rev add primary key using index directory_entry_rev_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on directory_entry_rev(target, name, perms);
+\endif
+
+
+-- person
+create unique index concurrently person_pkey on person(id);
+alter table person add primary key using index person_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on person(fullname);
+\else
+ create index concurrently on person(fullname); -- to be reviewed
+\endif
+
+\if :dbflavor_default
+ create index concurrently on person(name); -- to be reviewed
+ create index concurrently on person(email); -- to be reviewed
+\endif
+
+-- revision
+create unique index concurrently revision_pkey on revision(id);
+alter table revision add primary key using index revision_pkey;
+
+\if :dbflavor_does_deduplication
+ alter table revision add constraint revision_author_fkey foreign key (author) references person(id) not valid;
+ alter table revision validate constraint revision_author_fkey;
+ alter table revision add constraint revision_committer_fkey foreign key (committer) references person(id) not valid;
+ alter table revision validate constraint revision_committer_fkey;
+
+ alter table revision
+ add constraint revision_date_neg_utc_offset_not_null
+ check (date is null or date_neg_utc_offset is not null)
+ not valid;
+ alter table revision
+ add constraint revision_committer_date_neg_utc_offset_not_null
+ check (committer_date is null or committer_date_neg_utc_offset is not null)
+ not valid;
+
+ alter table revision
+ validate constraint revision_date_neg_utc_offset_not_null;
+ alter table revision
+ validate constraint revision_committer_date_neg_utc_offset_not_null;
+\endif
+
+\if :dbflavor_default
+ create index concurrently on revision(directory); -- to be reviewed
+ create unique index concurrently on revision(object_id); -- to be reviewed
+\endif
+
+-- revision_history
+create unique index concurrently revision_history_pkey on revision_history(id, parent_rank);
+alter table revision_history add primary key using index revision_history_pkey;
+
+\if :dbflavor_default
+ create index concurrently on revision_history(parent_id); -- to be reviewed
+\endif
+
+\if :dbflavor_does_deduplication
+ alter table revision_history add constraint revision_history_id_fkey foreign key (id) references revision(id) not valid;
+ alter table revision_history validate constraint revision_history_id_fkey;
+\endif
+
+-- snapshot
+create unique index concurrently snapshot_pkey on snapshot(object_id);
+alter table snapshot add primary key using index snapshot_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on snapshot(id);
+\else
+ create index concurrently on snapshot(id);
+\endif
+
+-- snapshot_branch
+create unique index concurrently snapshot_branch_pkey on snapshot_branch(object_id);
+alter table snapshot_branch add primary key using index snapshot_branch_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently on snapshot_branch (target_type, target, name);
+ alter table snapshot_branch add constraint snapshot_branch_target_check check ((target_type is null) = (target is null)) not valid;
+ alter table snapshot_branch validate constraint snapshot_branch_target_check;
+ alter table snapshot_branch add constraint snapshot_target_check check (target_type not in ('content', 'directory', 'revision', 'release', 'snapshot') or length(target) = 20) not valid;
+ alter table snapshot_branch validate constraint snapshot_target_check;
+
+ create unique index concurrently on snapshot_branch (name) where target_type is null and target is null;
+\endif
+
+-- snapshot_branches
+create unique index concurrently snapshot_branches_pkey on snapshot_branches(snapshot_id, branch_id);
+alter table snapshot_branches add primary key using index snapshot_branches_pkey;
+
+\if :dbflavor_does_deduplication
+ alter table snapshot_branches add constraint snapshot_branches_snapshot_id_fkey foreign key (snapshot_id) references snapshot(object_id) not valid;
+ alter table snapshot_branches validate constraint snapshot_branches_snapshot_id_fkey;
+
+ alter table snapshot_branches add constraint snapshot_branches_branch_id_fkey foreign key (branch_id) references snapshot_branch(object_id) not valid;
+ alter table snapshot_branches validate constraint snapshot_branches_branch_id_fkey;
+\endif
+
+-- origin_visit
+create unique index concurrently origin_visit_pkey on origin_visit(origin, visit);
+alter table origin_visit add primary key using index origin_visit_pkey;
+
+\if :dbflavor_default
+ create index concurrently on origin_visit(date); -- to be reviewed
+ create index concurrently origin_visit_type_date on origin_visit(type, date); -- to be reviewed
+\endif
+
+\if :dbflavor_does_deduplication
+ alter table origin_visit add constraint origin_visit_origin_fkey foreign key (origin) references origin(id) not valid;
+ alter table origin_visit validate constraint origin_visit_origin_fkey;
+\endif
+
+-- origin_visit_status
+
+create unique index concurrently origin_visit_status_pkey on origin_visit_status(origin, visit, date);
+alter table origin_visit_status add primary key using index origin_visit_status_pkey;
+
+\if :dbflavor_default
+ alter table origin_visit_status
+ add constraint origin_visit_status_origin_visit_fkey
+ foreign key (origin, visit)
+ references origin_visit(origin, visit) not valid;
+ alter table origin_visit_status validate constraint origin_visit_status_origin_visit_fkey;
+\endif
+
+-- release
+create unique index concurrently release_pkey on release(id);
+alter table release add primary key using index release_pkey;
+
+\if :dbflavor_default
+ create index concurrently on release(target, target_type); -- to be reviewed
+ create unique index concurrently on release(object_id); -- to be reviewed
+\endif
+
+\if :dbflavor_does_deduplication
+ alter table release add constraint release_author_fkey foreign key (author) references person(id) not valid;
+ alter table release validate constraint release_author_fkey;
+
+ alter table release
+ add constraint release_date_neg_utc_offset_not_null
+ check (date is null or date_neg_utc_offset is not null)
+ not valid;
+
+ alter table release
+ validate constraint release_date_neg_utc_offset_not_null;
+
+ -- if the author is null, then the date must be null
+ alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid;
+ alter table release validate constraint release_author_date_check;
+\endif
+
+-- metadata_fetcher
+create unique index metadata_fetcher_pkey on metadata_fetcher(id);
+alter table metadata_fetcher add primary key using index metadata_fetcher_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index metadata_fetcher_name_version on metadata_fetcher(name, version);
+\else
+ create index metadata_fetcher_name_version on metadata_fetcher(name, version);
+\endif
+
+-- metadata_authority
+create unique index concurrently metadata_authority_pkey on metadata_authority(id);
+alter table metadata_authority add primary key using index metadata_authority_pkey;
+
+\if :dbflavor_does_deduplication
+ create unique index concurrently metadata_authority_type_url on metadata_authority(type, url);
+\else
+ create index concurrently metadata_authority_type_url on metadata_authority(type, url);
+\endif
+
+
+-- raw_extrinsic_metadata
+create unique index concurrently raw_extrinsic_metadata_content_authority_date_fetcher on raw_extrinsic_metadata(target, authority_id, discovery_date, fetcher_id);
+
+\if :dbflavor_default
+ alter table raw_extrinsic_metadata add constraint raw_extrinsic_metadata_authority_fkey foreign key (authority_id) references metadata_authority(id) not valid;
+ alter table raw_extrinsic_metadata validate constraint raw_extrinsic_metadata_authority_fkey;
+
+ alter table raw_extrinsic_metadata add constraint raw_extrinsic_metadata_fetcher_fkey foreign key (fetcher_id) references metadata_fetcher(id) not valid;
+ alter table raw_extrinsic_metadata validate constraint raw_extrinsic_metadata_fetcher_fkey;
+\endif
+
+-- object_counts
+create unique index concurrently object_counts_pkey on object_counts(object_type);
+alter table object_counts add primary key using index object_counts_pkey;
+
+-- object_counts_bucketed
+create unique index concurrently object_counts_bucketed_pkey on object_counts_bucketed(line);
+alter table object_counts_bucketed add primary key using index object_counts_bucketed_pkey;
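Aside on the index definitions above, not part of the diff: most tables follow the same two-step pattern of building a unique index with CREATE UNIQUE INDEX CONCURRENTLY and then attaching it with ALTER TABLE ... ADD PRIMARY KEY USING INDEX. The concurrent build avoids blocking writes for the duration of the index creation, and the ADD PRIMARY KEY step then only needs a brief lock to turn the finished index into the primary key constraint. A sketch of the pattern on a hypothetical table:

    create unique index concurrently widget_pkey on widget(id);   -- hypothetical table
    alter table widget add primary key using index widget_pkey;

CREATE INDEX CONCURRENTLY cannot run inside a transaction block, which is presumably why these statements live in a standalone psql script rather than in a transactional migration.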
diff --git a/swh/storage/tests/data/sql-v0.18.0/logical_replication/replication_source.sql b/swh/storage/tests/data/sql-v0.18.0/logical_replication/replication_source.sql
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/data/sql-v0.18.0/logical_replication/replication_source.sql
@@ -0,0 +1,25 @@
+-- This file contains the instructions to create a replication source for
+-- PostgreSQL logical replication to another database.
+
+CREATE PUBLICATION softwareheritage;
+
+ALTER PUBLICATION softwareheritage ADD TABLE content;
+ALTER PUBLICATION softwareheritage ADD TABLE skipped_content;
+ALTER PUBLICATION softwareheritage ADD TABLE directory;
+ALTER PUBLICATION softwareheritage ADD TABLE directory_entry_file;
+ALTER PUBLICATION softwareheritage ADD TABLE directory_entry_dir;
+ALTER PUBLICATION softwareheritage ADD TABLE directory_entry_rev;
+ALTER PUBLICATION softwareheritage ADD TABLE person;
+ALTER PUBLICATION softwareheritage ADD TABLE revision;
+ALTER PUBLICATION softwareheritage ADD TABLE revision_history;
+ALTER PUBLICATION softwareheritage ADD TABLE release;
+ALTER PUBLICATION softwareheritage ADD TABLE snapshot;
+ALTER PUBLICATION softwareheritage ADD TABLE snapshot_branch;
+ALTER PUBLICATION softwareheritage ADD TABLE snapshot_branches;
+ALTER PUBLICATION softwareheritage ADD TABLE origin;
+ALTER PUBLICATION softwareheritage ADD TABLE origin_visit;
+ALTER PUBLICATION softwareheritage ADD TABLE origin_visit_status;
+ALTER PUBLICATION softwareheritage ADD TABLE metadata_fetcher;
+ALTER PUBLICATION softwareheritage ADD TABLE metadata_authority;
+ALTER PUBLICATION softwareheritage ADD TABLE raw_extrinsic_metadata;
+ALTER PUBLICATION softwareheritage ADD TABLE object_counts;
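Aside, not part of the diff: this file only configures the publishing side, and logical replication additionally requires wal_level = logical on that publisher. The receiving database must already have matching table definitions, since logical replication does not carry DDL, and it would subscribe with something along these lines (connection details are placeholders):

    -- hypothetical subscriber-side statement, not part of this change
    CREATE SUBSCRIPTION softwareheritage_mirror
        CONNECTION 'host=primary.example.org dbname=softwareheritage user=replication'
        PUBLICATION softwareheritage;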
diff --git a/swh/storage/tests/test_postgresql_migrated.py b/swh/storage/tests/test_postgresql_migrated.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_postgresql_migrated.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Tests postgresql migrations by initializing with an old schema, applying migrations,
+then running all the tests."""
+
+import glob
+from os import path
+
+import pytest
+import pytest_postgresql
+
+from swh.core.db.pytest_plugin import postgresql_fact
+from swh.core.utils import numfile_sortkey as sortkey
+import swh.storage
+from swh.storage.tests.storage_tests import TestStorage # noqa
+
+BASE_DIR = path.dirname(swh.storage.__file__)
+SQL_UPGRADES_DIR = path.join(BASE_DIR, "../../sql/upgrades")
+
+
+PRE_MIGRATION_SCHEMA_DIR = "sql-v0.18.0"
+"""swh/storage/tests/data/{PRE_MIGRATION_SCHEMA_DIR}/ should be a copy of
+swh/storage/sql/ from a previous release."""
+
+BASE_DBVERSION = 164
+"""dbversion in swh/storage/tests/data/{PRE_MIGRATION_SCHEMA_DIR}/30_schema.sql"""
+
+pre_migration_schema_files = sorted(
+ glob.glob(path.join(BASE_DIR, "tests/data", PRE_MIGRATION_SCHEMA_DIR, "*.sql"))
+)
+
+migration_files = sorted(glob.glob(path.join(SQL_UPGRADES_DIR, "*.sql")), key=sortkey,)
+"""All migration files."""
+
+use_migration_files = [
+ filename
+ for filename in migration_files
+ if int(path.splitext(path.basename(filename))[0]) > BASE_DBVERSION
+]
+"""Migration files used to go from BASE_DBVERSION to the current dbversion."""
+
+migrated_swh_storage_postgresql = postgresql_fact(
+ "postgresql_proc_migrated",
+ db_name="storage",
+ dump_files=pre_migration_schema_files + use_migration_files,
+)
+
+postgresql_proc_migrated = pytest_postgresql.factories.postgresql_proc()
+"""Like postgresql_proc, but initialized with the old schema + migration files,
+instead of directly with the current schema."""
+
+
+@pytest.fixture
+def swh_storage_backend_config(
+ swh_storage_backend_config, migrated_swh_storage_postgresql
+):
+ yield {
+ **swh_storage_backend_config,
+ "db": migrated_swh_storage_postgresql.dsn,
+ }
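Aside, not part of the diff: the upgrade scripts have purely numeric basenames, so they must be applied in numeric rather than lexicographic order (165.sql before 1000.sql). That is what the numfile_sortkey import is assumed to provide; a rough, hypothetical stand-in with the same observable behaviour for these filenames would be:

    import os.path

    def numeric_sortkey(filename):
        # hypothetical stand-in, not the real swh.core.utils.numfile_sortkey
        stem = os.path.splitext(os.path.basename(filename))[0]
        return (int(stem) if stem.isdigit() else float("inf"), stem)

    assert sorted(["1000.sql", "165.sql"], key=numeric_sortkey) == ["165.sql", "1000.sql"]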
diff --git a/swh/storage/tests/test_postgresql_migration.py b/swh/storage/tests/test_postgresql_migration.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_postgresql_migration.py
@@ -0,0 +1,194 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import glob
+import os
+import subprocess
+
+import attr
+import pytest
+from pytest_postgresql import factories
+
+from swh.core.utils import numfile_sortkey as sortkey
+from swh.storage import get_storage
+
+from .storage_tests import transform_entries
+
+DIR = os.path.dirname(__file__)
+
+
+pg_storage_migration_proc = factories.postgresql_proc()
+pg_storage_migration = factories.postgresql("pg_storage_migration_proc")
+
+
+def psql_run_file(dsn, filename):
+ subprocess.check_call(
+ [
+ "psql",
+ "--quiet",
+ "--no-psqlrc",
+ "-v",
+ "ON_ERROR_STOP=1",
+ "-d",
+ dsn,
+ "-f",
+ filename,
+ ]
+ )
+
+
+@pytest.fixture
+def storage(pg_storage_migration):
+ for filename in sorted(
+ glob.glob(os.path.join(DIR, "data", "sql-v0.18.0", "*.sql"))
+ ):
+ psql_run_file(pg_storage_migration.dsn, filename)
+
+ config = {
+ "cls": "local",
+ "db": pg_storage_migration.dsn,
+ "objstorage": {"cls": "memory"},
+ "check_config": False, # it would error on the dbversion number
+ }
+ return get_storage(**config)
+
+
+@pytest.mark.db
+class TestPgStorageMigration:
+ """Creates an old schema, inserts some data, runs migrations, and checks the
+ data still exists."""
+
+ def _migrate(self, db):
+ current_version = db.dbversion()["version"]
+
+ filenames = sorted(
+ glob.glob(os.path.join(DIR, "../../../sql/upgrades/*.sql")), key=sortkey,
+ )
+
+ nb_migrations = 0
+
+ for filename in filenames:
+ (version_str, ext) = os.path.splitext(os.path.basename(filename))
+ assert ext == ".sql"
+ version = int(version_str)
+
+ if version <= current_version:
+ # this migration file is not newer than the current schema version; skip it
+ assert nb_migrations == 0
+ continue
+
+ nb_migrations += 1
+ psql_run_file(db.conn.dsn, filename)
+
+ assert nb_migrations, "no migrations applied"
+
+ @pytest.mark.parametrize("migrate_after_insert", (True, False))
+ def test_content(self, storage, sample_data, migrate_after_insert):
+ swh_storage = storage
+ if migrate_after_insert:
+ swh_storage.content_add(sample_data.contents)
+
+ with swh_storage.db() as db:
+ self._migrate(db)
+
+ if not migrate_after_insert:
+ swh_storage.content_add(sample_data.contents)
+
+ for content in sample_data.contents:
+ assert not list(swh_storage.content_missing([content.to_dict()]))
+ assert swh_storage.content_get([content.sha1]) == [
+ attr.evolve(content, data=None)
+ ]
+
+ @pytest.mark.parametrize("migrate_after_insert", (True, False))
+ def test_skipped_content(self, storage, sample_data, migrate_after_insert):
+ swh_storage = storage
+ if migrate_after_insert:
+ swh_storage.skipped_content_add(sample_data.skipped_contents)
+
+ with swh_storage.db() as db:
+ self._migrate(db)
+
+ if not migrate_after_insert:
+ swh_storage.skipped_content_add(sample_data.skipped_contents)
+
+ for skipped_content in sample_data.skipped_contents:
+ assert not list(
+ swh_storage.skipped_content_missing([skipped_content.to_dict()])
+ )
+
+ @pytest.mark.parametrize("migrate_after_insert", (True, False))
+ def test_directory(self, storage, sample_data, migrate_after_insert):
+ swh_storage = storage
+ if migrate_after_insert:
+ swh_storage.directory_add(sample_data.directories)
+ swh_storage.content_add(sample_data.contents)
+
+ with swh_storage.db() as db:
+ self._migrate(db)
+
+ if not migrate_after_insert:
+ swh_storage.directory_add(sample_data.directories)
+ swh_storage.content_add(sample_data.contents)
+
+ for directory in sample_data.directories:
+ assert not list(swh_storage.directory_missing([directory.id]))
+
+ actual_data = list(swh_storage.directory_ls(directory.id))
+ expected_data = list(transform_entries(storage, directory))
+
+ for data in actual_data:
+ assert data in expected_data
+
+ @pytest.mark.parametrize("migrate_after_insert", (True, False))
+ def test_revision(self, storage, sample_data, migrate_after_insert):
+ swh_storage = storage
+ if migrate_after_insert:
+ swh_storage.revision_add(sample_data.revisions)
+
+ with swh_storage.db() as db:
+ self._migrate(db)
+
+ if not migrate_after_insert:
+ swh_storage.revision_add(sample_data.revisions)
+
+ for revision in sample_data.revisions:
+ assert not list(swh_storage.revision_missing([revision.id]))
+ assert swh_storage.revision_get([revision.id]) == [revision]
+
+ @pytest.mark.parametrize("migrate_after_insert", (True, False))
+ def test_release(self, storage, sample_data, migrate_after_insert):
+ swh_storage = storage
+ if migrate_after_insert:
+ swh_storage.release_add(sample_data.releases)
+
+ with swh_storage.db() as db:
+ self._migrate(db)
+
+ if not migrate_after_insert:
+ swh_storage.release_add(sample_data.releases)
+
+ for release in sample_data.releases:
+ assert not list(swh_storage.release_missing([release.id]))
+ assert swh_storage.release_get([release.id]) == [release]
+
+ @pytest.mark.parametrize("migrate_after_insert", (True, False))
+ def test_snapshot(self, storage, sample_data, migrate_after_insert):
+ swh_storage = storage
+ if migrate_after_insert:
+ swh_storage.snapshot_add(sample_data.snapshots)
+
+ with swh_storage.db() as db:
+ self._migrate(db)
+
+ if not migrate_after_insert:
+ swh_storage.snapshot_add(sample_data.snapshots)
+
+ for snapshot in sample_data.snapshots:
+ assert not list(swh_storage.snapshot_missing([snapshot.id]))
+ assert swh_storage.snapshot_get(snapshot.id) == {
+ **snapshot.to_dict(),
+ "next_branch": None,
+ }
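Aside, not part of the diff: every test above has the same shape, add objects either before or after applying the pending upgrade scripts, then check they can still be read back. Extending the class to another object type would look roughly like the sketch below; the origin_add/origin_get calls are assumed from the existing swh.storage API and sample_data.origins from the shared test fixtures:

    @pytest.mark.parametrize("migrate_after_insert", (True, False))
    def test_origin(self, storage, sample_data, migrate_after_insert):
        swh_storage = storage
        if migrate_after_insert:
            swh_storage.origin_add(sample_data.origins)

        with swh_storage.db() as db:
            self._migrate(db)

        if not migrate_after_insert:
            swh_storage.origin_add(sample_data.origins)

        for origin in sample_data.origins:
            assert swh_storage.origin_get([origin.url]) == [origin]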
