Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/mypy.ini b/mypy.ini
index e7ee430..9da0730 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,33 +1,33 @@
[mypy]
namespace_packages = True
# due to the conditional import logic on swh.journal, in some cases a specific
-# type: ignore is neede, in other it isn't...
+# type: ignore is needed, in other it isn't...
warn_unused_ignores = False
# support for sqlalchemy magic: see https://github.com/dropbox/sqlalchemy-stubs
plugins = sqlmypy
# 3rd party libraries without stubs (yet)
# only shipped indirectly via hypothesis
[mypy-django.*]
ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
[mypy-psycopg2.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
# temporary work-around for landing typing support in spite of the current
# journal<->storage dependency loop
[mypy-swh.journal.*]
ignore_missing_imports = True
[mypy-pytest_postgresql.*]
ignore_missing_imports = True
diff --git a/sql/upgrades/032.sql b/sql/upgrades/032.sql
index 3804909..d245c13 100644
--- a/sql/upgrades/032.sql
+++ b/sql/upgrades/032.sql
@@ -1,121 +1,121 @@
-- SWH DB schema upgrade
-- from_version: 30
-- to_version: 32
--- description: Reading data improvment on directory and release data.
+-- description: Reading data improvement on directory and release data.
insert into dbversion(version, release, description)
values(32, now(), 'Work In Progress');
CREATE FUNCTION swh_mktemp_release_get()
returns void
language sql
as $$
create temporary table tmp_release_get(
id sha1_git primary key
) on commit drop;
$$;
-- Detailed entry for a release
CREATE TYPE release_entry AS
(
id sha1_git,
revision sha1_git,
date timestamptz,
date_offset smallint,
name text,
comment bytea,
synthetic boolean,
author_name bytea,
author_email bytea
);
-- Detailed entry for release
CREATE OR REPLACE FUNCTION swh_release_get()
returns setof release_entry
language plpgsql
as $$
begin
return query
select r.id, r.revision, r.date, r.date_offset, r.name, r.comment,
r.synthetic, p.name as author_name, p.email as author_email
from tmp_release_get t
inner join release r on t.id = r.id
inner join person p on p.id = r.author;
return;
end
$$;
DROP TYPE IF EXISTS directory_entry CASCADE;
-- a directory listing entry with all the metadata
--
-- can be used to list a directory, and retrieve all the data in one go.
CREATE TYPE directory_entry AS
(
dir_id sha1_git, -- id of the parent directory
type directory_entry_type, -- type of entry
target sha1_git, -- id of target
name unix_path, -- path name, relative to containing dir
perms file_perms, -- unix-like permissions
status content_status, -- visible or absent
sha1 sha1, -- content if sha1 if type is not dir
sha1_git sha1_git, -- content's sha1 git if type is not dir
sha256 sha256 -- content's sha256 if type is not dir
);
-- List a single level of directory walked_dir_id
-- FIXME: order by name is not correct. For git, we need to order by
-- lexicographic order but as if a trailing / is present in directory
-- name
create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
returns setof directory_entry
language sql
stable
as $$
with dir as (
select id as dir_id, dir_entries, file_entries, rev_entries
from directory
where id = walked_dir_id),
ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
(select dir_id, 'dir'::directory_entry_type as type,
e.target, e.name, e.perms, NULL::content_status,
NULL::sha1, NULL::sha1_git, NULL::sha256
from ls_d
left join directory_entry_dir e on ls_d.entry_id = e.id)
union
(select dir_id, 'file'::directory_entry_type as type,
e.target, e.name, e.perms, c.status,
c.sha1, c.sha1_git, c.sha256
from ls_f
left join directory_entry_file e on ls_f.entry_id = e.id
left join content c on e.target = c.sha1_git)
union
(select dir_id, 'rev'::directory_entry_type as type,
e.target, e.name, e.perms, NULL::content_status,
NULL::sha1, NULL::sha1_git, NULL::sha256
from ls_r
left join directory_entry_rev e on ls_r.entry_id = e.id)
order by name;
$$;
-- List recursively the content of a directory
create or replace function swh_directory_walk(walked_dir_id sha1_git)
returns setof directory_entry
language sql
stable
as $$
with recursive entries as (
select dir_id, type, target, name, perms, status, sha1, sha1_git,
sha256
from swh_directory_walk_one(walked_dir_id)
union all
select dir_id, type, target, (dirname || '/' || name)::unix_path as name,
perms, status, sha1, sha1_git, sha256
from (select (swh_directory_walk_one(dirs.target)).*, dirs.name as dirname
from (select target, name from entries where type = 'dir') as dirs) as with_parent
)
select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256
from entries
$$;
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
index 7c1a88d..46b3ab2 100644
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -1,1121 +1,1121 @@
create or replace function hash_sha1(text)
returns text
as $$
select encode(digest($1, 'sha1'), 'hex')
$$ language sql strict immutable;
comment on function hash_sha1(text) is 'Compute SHA1 hash as text';
-- create a temporary table called tmp_TBLNAME, mimicking existing table
-- TBLNAME
--
-- Args:
--- tblname: name of the table to mimick
+-- tblname: name of the table to mimic
create or replace function swh_mktemp(tblname regclass)
returns void
language plpgsql
as $$
begin
execute format('
create temporary table tmp_%1$I
(like %1$I including defaults)
on commit drop;
alter table tmp_%1$I drop column if exists object_id;
', tblname);
return;
end
$$;
-- create a temporary table for directory entries called tmp_TBLNAME,
-- mimicking existing table TBLNAME with an extra dir_id (sha1_git)
-- column, and dropping the id column.
--
-- This is used to create the tmp_directory_entry_<foo> tables.
--
-- Args:
--- tblname: name of the table to mimick
+-- tblname: name of the table to mimic
create or replace function swh_mktemp_dir_entry(tblname regclass)
returns void
language plpgsql
as $$
begin
execute format('
create temporary table tmp_%1$I
(like %1$I including defaults, dir_id sha1_git)
on commit drop;
alter table tmp_%1$I drop column id;
', tblname);
return;
end
$$;
-- create a temporary table for revisions called tmp_revisions,
-- mimicking existing table revision, replacing the foreign keys to
-- people with an email and name field
--
create or replace function swh_mktemp_revision()
returns void
language sql
as $$
create temporary table tmp_revision (
like revision including defaults,
author_fullname bytea,
author_name bytea,
author_email bytea,
committer_fullname bytea,
committer_name bytea,
committer_email bytea
) on commit drop;
alter table tmp_revision drop column author;
alter table tmp_revision drop column committer;
alter table tmp_revision drop column object_id;
$$;
-- create a temporary table for releases called tmp_release,
-- mimicking existing table release, replacing the foreign keys to
-- people with an email and name field
--
create or replace function swh_mktemp_release()
returns void
language sql
as $$
create temporary table tmp_release (
like release including defaults,
author_fullname bytea,
author_name bytea,
author_email bytea
) on commit drop;
alter table tmp_release drop column author;
alter table tmp_release drop column object_id;
$$;
-- create a temporary table for the branches of a snapshot
create or replace function swh_mktemp_snapshot_branch()
returns void
language sql
as $$
create temporary table tmp_snapshot_branch (
name bytea not null,
target bytea,
target_type snapshot_target
) on commit drop;
$$;
create or replace function swh_mktemp_tool()
returns void
language sql
as $$
create temporary table tmp_tool (
like tool including defaults
) on commit drop;
alter table tmp_tool drop column id;
$$;
-- a content signature is a set of cryptographic checksums that we use to
-- uniquely identify content, for the purpose of verifying if we already have
-- some content or not during content injection
create type content_signature as (
sha1 sha1,
sha1_git sha1_git,
sha256 sha256,
blake2s256 blake2s256
);
-- check which entries of tmp_skipped_content are missing from skipped_content
--
-- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
-- 2. call this function
create or replace function swh_skipped_content_missing()
returns setof content_signature
language plpgsql
as $$
begin
return query
select sha1, sha1_git, sha256, blake2s256 from tmp_skipped_content t
where not exists
(select 1 from skipped_content s where
s.sha1 is not distinct from t.sha1 and
s.sha1_git is not distinct from t.sha1_git and
s.sha256 is not distinct from t.sha256);
return;
end
$$;
-- Look up content based on one or several different checksums. Return all
-- content information if the content is found; a NULL row otherwise.
--
-- At least one checksum should be not NULL. If several are not NULL, they will
-- be AND-ed together in the lookup query.
--
-- Note: this function is meant to be used to look up individual contents
-- (e.g., for the web app), for batch lookup of missing content (e.g., to be
-- added) see swh_content_missing
create or replace function swh_content_find(
sha1 sha1 default NULL,
sha1_git sha1_git default NULL,
sha256 sha256 default NULL,
blake2s256 blake2s256 default NULL
)
returns content
language plpgsql
as $$
declare
con content;
filters text[] := array[] :: text[]; -- AND-clauses used to filter content
q text;
begin
if sha1 is not null then
filters := filters || format('sha1 = %L', sha1);
end if;
if sha1_git is not null then
filters := filters || format('sha1_git = %L', sha1_git);
end if;
if sha256 is not null then
filters := filters || format('sha256 = %L', sha256);
end if;
if blake2s256 is not null then
filters := filters || format('blake2s256 = %L', blake2s256);
end if;
if cardinality(filters) = 0 then
return null;
else
q = format('select * from content where %s',
array_to_string(filters, ' and '));
execute q into con;
return con;
end if;
end
$$;
-- add tmp_content entries to content, skipping duplicates
--
-- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
-- 2. call this function
create or replace function swh_content_add()
returns void
language plpgsql
as $$
begin
insert into content (sha1, sha1_git, sha256, blake2s256, length, status, ctime)
select distinct sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content;
return;
end
$$;
-- add tmp_skipped_content entries to skipped_content, skipping duplicates
--
-- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
-- 2. call this function
create or replace function swh_skipped_content_add()
returns void
language plpgsql
as $$
begin
insert into skipped_content (sha1, sha1_git, sha256, blake2s256, length, status, reason, origin)
select distinct sha1, sha1_git, sha256, blake2s256, length, status, reason, origin
from tmp_skipped_content
where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in (
select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')
from swh_skipped_content_missing()
);
-- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
-- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
-- the extra swh_content_missing() query here.
return;
end
$$;
-- Update content entries from temporary table.
-- (columns are potential new columns added to the schema, this cannot be empty)
--
create or replace function swh_content_update(columns_update text[])
returns void
language plpgsql
as $$
declare
query text;
tmp_array text[];
begin
if array_length(columns_update, 1) = 0 then
raise exception 'Please, provide the list of column names to update.';
end if;
tmp_array := array(select format('%1$s=t.%1$s', unnest) from unnest(columns_update));
query = format('update content set %s
from tmp_content t where t.sha1 = content.sha1',
array_to_string(tmp_array, ', '));
execute query;
return;
end
$$;
comment on function swh_content_update(text[]) IS 'Update existing content''s columns';
-- check which entries of tmp_directory are missing from directory
--
-- operates in bulk: 0. swh_mktemp(directory), 1. COPY to tmp_directory,
-- 2. call this function
create or replace function swh_directory_missing()
returns setof sha1_git
language plpgsql
as $$
begin
return query
select id from tmp_directory t
where not exists (
select 1 from directory d
where d.id = t.id);
return;
end
$$;
create type directory_entry_type as enum('file', 'dir', 'rev');
-- Add tmp_directory_entry_* entries to directory_entry_* and directory,
-- skipping duplicates in directory_entry_*. This is a generic function that
-- works on all kind of directory entries.
--
-- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_*'), 1 COPY to
-- tmp_directory_entry_*, 2. call this function
--
-- Assumption: this function is used in the same transaction that inserts the
-- context directory in table "directory".
create or replace function swh_directory_entry_add(typ directory_entry_type)
returns void
language plpgsql
as $$
begin
execute format('
insert into directory_entry_%1$s (target, name, perms)
select distinct t.target, t.name, t.perms
from tmp_directory_entry_%1$s t
where not exists (
select 1
from directory_entry_%1$s i
where t.target = i.target and t.name = i.name and t.perms = i.perms)
', typ);
execute format('
with new_entries as (
select t.dir_id, array_agg(i.id) as entries
from tmp_directory_entry_%1$s t
inner join directory_entry_%1$s i
using (target, name, perms)
group by t.dir_id
)
update tmp_directory as d
set %1$s_entries = new_entries.entries
from new_entries
where d.id = new_entries.dir_id
', typ);
return;
end
$$;
-- Insert the data from tmp_directory, tmp_directory_entry_file,
-- tmp_directory_entry_dir, tmp_directory_entry_rev into their final
-- tables.
--
-- Prerequisites:
-- directory ids in tmp_directory
-- entries in tmp_directory_entry_{file,dir,rev}
--
create or replace function swh_directory_add()
returns void
language plpgsql
as $$
begin
perform swh_directory_entry_add('file');
perform swh_directory_entry_add('dir');
perform swh_directory_entry_add('rev');
insert into directory
select * from tmp_directory t
where not exists (
select 1 from directory d
where d.id = t.id);
return;
end
$$;
-- a directory listing entry with all the metadata
--
-- can be used to list a directory, and retrieve all the data in one go.
create type directory_entry as
(
dir_id sha1_git, -- id of the parent directory
type directory_entry_type, -- type of entry
target sha1_git, -- id of target
name unix_path, -- path name, relative to containing dir
perms file_perms, -- unix-like permissions
status content_status, -- visible or absent
sha1 sha1, -- content if sha1 if type is not dir
sha1_git sha1_git, -- content's sha1 git if type is not dir
sha256 sha256, -- content's sha256 if type is not dir
length bigint -- content length if type is not dir
);
-- List a single level of directory walked_dir_id
-- FIXME: order by name is not correct. For git, we need to order by
-- lexicographic order but as if a trailing / is present in directory
-- name
create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
returns setof directory_entry
language sql
stable
as $$
with dir as (
select id as dir_id, dir_entries, file_entries, rev_entries
from directory
where id = walked_dir_id),
ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
(select dir_id, 'dir'::directory_entry_type as type,
e.target, e.name, e.perms, NULL::content_status,
NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
from ls_d
left join directory_entry_dir e on ls_d.entry_id = e.id)
union
(select dir_id, 'file'::directory_entry_type as type,
e.target, e.name, e.perms, c.status,
c.sha1, c.sha1_git, c.sha256, c.length
from ls_f
left join directory_entry_file e on ls_f.entry_id = e.id
left join content c on e.target = c.sha1_git)
union
(select dir_id, 'rev'::directory_entry_type as type,
e.target, e.name, e.perms, NULL::content_status,
NULL::sha1, NULL::sha1_git, NULL::sha256, NULL::bigint
from ls_r
left join directory_entry_rev e on ls_r.entry_id = e.id)
order by name;
$$;
-- List recursively the revision directory arborescence
create or replace function swh_directory_walk(walked_dir_id sha1_git)
returns setof directory_entry
language sql
stable
as $$
with recursive entries as (
select dir_id, type, target, name, perms, status, sha1, sha1_git,
sha256, length
from swh_directory_walk_one(walked_dir_id)
union all
select dir_id, type, target, (dirname || '/' || name)::unix_path as name,
perms, status, sha1, sha1_git, sha256, length
from (select (swh_directory_walk_one(dirs.target)).*, dirs.name as dirname
from (select target, name from entries where type = 'dir') as dirs) as with_parent
)
select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256, length
from entries
$$;
create or replace function swh_revision_walk(revision_id sha1_git)
returns setof directory_entry
language sql
stable
as $$
select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256, length
from swh_directory_walk((select directory from revision where id=revision_id))
$$;
COMMENT ON FUNCTION swh_revision_walk(sha1_git) IS 'Recursively list the revision targeted directory arborescence';
-- Find a directory entry by its path
create or replace function swh_find_directory_entry_by_path(
walked_dir_id sha1_git,
dir_or_content_path bytea[])
returns directory_entry
language plpgsql
as $$
declare
end_index integer;
paths bytea default '';
path bytea;
res bytea[];
r record;
begin
end_index := array_upper(dir_or_content_path, 1);
res[1] := walked_dir_id;
for i in 1..end_index
loop
path := dir_or_content_path[i];
-- concatenate path for patching the name in the result record (if we found it)
if i = 1 then
paths = path;
else
paths := paths || '/' || path; -- concatenate paths
end if;
if i <> end_index then
select *
from swh_directory_walk_one(res[i] :: sha1_git)
where name=path
and type = 'dir'
limit 1 into r;
else
select *
from swh_directory_walk_one(res[i] :: sha1_git)
where name=path
limit 1 into r;
end if;
-- find the path
if r is null then
return null;
else
-- store the next dir to lookup the next local path from
res[i+1] := r.target;
end if;
end loop;
-- at this moment, r is the result. Patch its 'name' with the full path before returning it.
r.name := paths;
return r;
end
$$;
-- List all revision IDs starting from a given revision, going back in time
--
-- TODO ordering: should be breadth-first right now (what do we want?)
-- TODO ordering: ORDER BY parent_rank somewhere?
create or replace function swh_revision_list(root_revisions bytea[], num_revs bigint default NULL)
returns table (id sha1_git, parents bytea[])
language sql
stable
as $$
with recursive full_rev_list(id) as (
(select id from revision where id = ANY(root_revisions))
union
(select h.parent_id
from revision_history as h
join full_rev_list on h.id = full_rev_list.id)
),
rev_list as (select id from full_rev_list limit num_revs)
select rev_list.id as id,
array(select rh.parent_id::bytea
from revision_history rh
where rh.id = rev_list.id
order by rh.parent_rank
) as parent
from rev_list;
$$;
-- List all the children of a given revision
create or replace function swh_revision_list_children(root_revisions bytea[], num_revs bigint default NULL)
returns table (id sha1_git, parents bytea[])
language sql
stable
as $$
with recursive full_rev_list(id) as (
(select id from revision where id = ANY(root_revisions))
union
(select h.id
from revision_history as h
join full_rev_list on h.parent_id = full_rev_list.id)
),
rev_list as (select id from full_rev_list limit num_revs)
select rev_list.id as id,
array(select rh.parent_id::bytea
from revision_history rh
where rh.id = rev_list.id
order by rh.parent_rank
) as parent
from rev_list;
$$;
-- Detailed entry for a revision
create type revision_entry as
(
id sha1_git,
date timestamptz,
date_offset smallint,
date_neg_utc_offset boolean,
committer_date timestamptz,
committer_date_offset smallint,
committer_date_neg_utc_offset boolean,
type revision_type,
directory sha1_git,
message bytea,
author_id bigint,
author_fullname bytea,
author_name bytea,
author_email bytea,
committer_id bigint,
committer_fullname bytea,
committer_name bytea,
committer_email bytea,
metadata jsonb,
synthetic boolean,
parents bytea[],
object_id bigint
);
-- "git style" revision log. Similar to swh_revision_list(), but returning all
-- information associated to each revision, and expanding authors/committers
create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
returns setof revision_entry
language sql
stable
as $$
select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
r.type, r.directory, r.message,
a.id, a.fullname, a.name, a.email,
c.id, c.fullname, c.name, c.email,
r.metadata, r.synthetic, t.parents, r.object_id
from swh_revision_list(root_revisions, num_revs) as t
left join revision r on t.id = r.id
left join person a on a.id = r.author
left join person c on c.id = r.committer;
$$;
-- Detailed entry for a release
create type release_entry as
(
id sha1_git,
target sha1_git,
target_type object_type,
date timestamptz,
date_offset smallint,
date_neg_utc_offset boolean,
name bytea,
comment bytea,
synthetic boolean,
author_id bigint,
author_fullname bytea,
author_name bytea,
author_email bytea,
object_id bigint
);
-- Create entries in person from tmp_revision
create or replace function swh_person_add_from_revision()
returns void
language plpgsql
as $$
begin
with t as (
select author_fullname as fullname, author_name as name, author_email as email from tmp_revision
union
select committer_fullname as fullname, committer_name as name, committer_email as email from tmp_revision
) insert into person (fullname, name, email)
select distinct on (fullname) fullname, name, email from t
where not exists (
select 1
from person p
where t.fullname = p.fullname
);
return;
end
$$;
-- Create entries in revision from tmp_revision
create or replace function swh_revision_add()
returns void
language plpgsql
as $$
begin
perform swh_person_add_from_revision();
insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic)
select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic
from tmp_revision t
left join person a on a.fullname = t.author_fullname
left join person c on c.fullname = t.committer_fullname;
return;
end
$$;
-- Create entries in person from tmp_release
create or replace function swh_person_add_from_release()
returns void
language plpgsql
as $$
begin
with t as (
select distinct author_fullname as fullname, author_name as name, author_email as email from tmp_release
where author_fullname is not null
) insert into person (fullname, name, email)
select distinct on (fullname) fullname, name, email from t
where not exists (
select 1
from person p
where t.fullname = p.fullname
);
return;
end
$$;
-- Create entries in release from tmp_release
create or replace function swh_release_add()
returns void
language plpgsql
as $$
begin
perform swh_person_add_from_release();
insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
select t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
from tmp_release t
left join person a on a.fullname = t.author_fullname;
return;
end
$$;
-- add a new origin_visit for origin origin_id at date.
--
-- Returns the new visit id.
create or replace function swh_origin_visit_add(origin_url text, date timestamptz, type text)
returns bigint
language sql
as $$
with origin_id as (
select id
from origin
where url = origin_url
), last_known_visit as (
select coalesce(max(visit), 0) as visit
from origin_visit
where origin = (select id from origin_id)
)
insert into origin_visit (origin, date, type, visit, status)
values ((select id from origin_id), date, type,
(select visit from last_known_visit) + 1, 'ongoing')
returning visit;
$$;
create or replace function swh_snapshot_add(snapshot_id sha1_git)
returns void
language plpgsql
as $$
declare
snapshot_object_id snapshot.object_id%type;
begin
select object_id from snapshot where id = snapshot_id into snapshot_object_id;
if snapshot_object_id is null then
insert into snapshot (id) values (snapshot_id) returning object_id into snapshot_object_id;
insert into snapshot_branch (name, target_type, target)
select name, target_type, target from tmp_snapshot_branch tmp
where not exists (
select 1
from snapshot_branch sb
where sb.name = tmp.name
and sb.target = tmp.target
and sb.target_type = tmp.target_type
)
on conflict do nothing;
insert into snapshot_branches (snapshot_id, branch_id)
select snapshot_object_id, sb.object_id as branch_id
from tmp_snapshot_branch tmp
join snapshot_branch sb
using (name, target, target_type)
where tmp.target is not null and tmp.target_type is not null
union
select snapshot_object_id, sb.object_id as branch_id
from tmp_snapshot_branch tmp
join snapshot_branch sb
using (name)
where tmp.target is null and tmp.target_type is null
and sb.target is null and sb.target_type is null;
end if;
truncate table tmp_snapshot_branch;
end;
$$;
create type snapshot_result as (
snapshot_id sha1_git,
name bytea,
target bytea,
target_type snapshot_target
);
create or replace function swh_snapshot_get_by_id(id sha1_git,
branches_from bytea default '', branches_count bigint default null,
target_types snapshot_target[] default NULL)
returns setof snapshot_result
language sql
stable
as $$
select
swh_snapshot_get_by_id.id as snapshot_id, name, target, target_type
from snapshot_branches
inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id
where snapshot_id = (select object_id from snapshot where snapshot.id = swh_snapshot_get_by_id.id)
and (target_types is null or target_type = any(target_types))
and name >= branches_from
order by name limit branches_count
$$;
create type snapshot_size as (
target_type snapshot_target,
count bigint
);
create or replace function swh_snapshot_count_branches(id sha1_git)
returns setof snapshot_size
language sql
stable
as $$
SELECT target_type, count(name)
from swh_snapshot_get_by_id(swh_snapshot_count_branches.id)
group by target_type;
$$;
-- Absolute path: directory reference + complete path relative to it
create type content_dir as (
directory sha1_git,
path unix_path
);
-- Find the containing directory of a given content, specified by sha1
-- (note: *not* sha1_git).
--
-- Return a pair (dir_it, path) where path is a UNIX path that, from the
-- directory root, reach down to a file with the desired content. Return NULL
-- if no match is found.
--
-- In case of multiple paths (i.e., pretty much always), an arbitrary one is
-- chosen.
create or replace function swh_content_find_directory(content_id sha1)
returns content_dir
language sql
stable
as $$
with recursive path as (
-- Recursively build a path from the requested content to a root
-- directory. Each iteration returns a pair (dir_id, filename) where
-- filename is relative to dir_id. Stops when no parent directory can
-- be found.
(select dir.id as dir_id, dir_entry_f.name as name, 0 as depth
from directory_entry_file as dir_entry_f
join content on content.sha1_git = dir_entry_f.target
join directory as dir on dir.file_entries @> array[dir_entry_f.id]
where content.sha1 = content_id
limit 1)
union all
(select dir.id as dir_id,
(dir_entry_d.name || '/' || path.name)::unix_path as name,
path.depth + 1
from path
join directory_entry_dir as dir_entry_d on dir_entry_d.target = path.dir_id
join directory as dir on dir.dir_entries @> array[dir_entry_d.id]
limit 1)
)
select dir_id, name from path order by depth desc limit 1;
$$;
-- Find the visit of origin closest to date visit_date
-- Breaks ties by selecting the largest visit id
create or replace function swh_visit_find_by_date(origin_url text, visit_date timestamptz default NOW())
returns setof origin_visit
language plpgsql
stable
as $$
declare
origin_id bigint;
begin
select id into origin_id from origin where url=origin_url;
return query
with closest_two_visits as ((
select ov, (date - visit_date), visit as interval
from origin_visit ov
where ov.origin = origin_id
and ov.date >= visit_date
order by ov.date asc, ov.visit desc
limit 1
) union (
select ov, (visit_date - date), visit as interval
from origin_visit ov
where ov.origin = origin_id
and ov.date < visit_date
order by ov.date desc, ov.visit desc
limit 1
)) select (ov).* from closest_two_visits order by interval, visit limit 1;
end
$$;
-- Object listing by object_id
create or replace function swh_content_list_by_object_id(
min_excl bigint,
max_incl bigint
)
returns setof content
language sql
stable
as $$
select * from content
where object_id > min_excl and object_id <= max_incl
order by object_id;
$$;
create or replace function swh_revision_list_by_object_id(
min_excl bigint,
max_incl bigint
)
returns setof revision_entry
language sql
stable
as $$
with revs as (
select * from revision
where object_id > min_excl and object_id <= max_incl
)
select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
r.type, r.directory, r.message,
a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
as parents, r.object_id
from revs r
left join person a on a.id = r.author
left join person c on c.id = r.committer
order by r.object_id;
$$;
create or replace function swh_release_list_by_object_id(
min_excl bigint,
max_incl bigint
)
returns setof release_entry
language sql
stable
as $$
with rels as (
select * from release
where object_id > min_excl and object_id <= max_incl
)
select r.id, r.target, r.target_type, r.date, r.date_offset, r.date_neg_utc_offset, r.name, r.comment,
r.synthetic, p.id as author_id, p.fullname as author_fullname, p.name as author_name, p.email as author_email, r.object_id
from rels r
left join person p on p.id = r.author
order by r.object_id;
$$;
-- end revision_metadata functions
-- origin_metadata functions
create type origin_metadata_signature as (
id bigint,
origin_url text,
discovery_date timestamptz,
tool_id bigint,
metadata jsonb,
provider_id integer,
provider_name text,
provider_type text,
provider_url text
);
create or replace function swh_origin_metadata_get_by_origin(
origin text)
returns setof origin_metadata_signature
language sql
stable
as $$
select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
mp.id as provider_id, provider_name, provider_type, provider_url
from origin_metadata as om
inner join metadata_provider mp on om.provider_id = mp.id
inner join origin o on om.origin_id = o.id
where o.url = origin
order by discovery_date desc;
$$;
create or replace function swh_origin_metadata_get_by_provider_type(
origin_url text,
provider_type text)
returns setof origin_metadata_signature
language sql
stable
as $$
select om.id as id, o.url as origin_url, discovery_date, tool_id, om.metadata,
mp.id as provider_id, provider_name, provider_type, provider_url
from origin_metadata as om
inner join metadata_provider mp on om.provider_id = mp.id
inner join origin o on om.origin_id = o.id
where o.url = origin_url
and mp.provider_type = provider_type
order by discovery_date desc;
$$;
-- end origin_metadata functions
-- add tmp_tool entries to tool,
-- skipping duplicates if any.
--
-- operates in bulk: 0. create temporary tmp_tool, 1. COPY to
-- it, 2. call this function to insert and filtering out duplicates
create or replace function swh_tool_add()
returns setof tool
language plpgsql
as $$
begin
insert into tool(name, version, configuration)
select name, version, configuration from tmp_tool tmp
on conflict(name, version, configuration) do nothing;
return query
select id, name, version, configuration
from tmp_tool join tool
using(name, version, configuration);
return;
end
$$;
-- simple counter mapping a textual label to an integer value
create type counter as (
label text,
value bigint
);
-- return statistics about the number of tuples in various SWH tables
--
-- Note: the returned values are based on postgres internal statistics
-- (pg_class table), which are only updated daily (by autovacuum) or so
create or replace function swh_stat_counters()
returns setof counter
language sql
stable
as $$
select object_type as label, value as value
from object_counts
where object_type in (
'content',
'directory',
'directory_entry_dir',
'directory_entry_file',
'directory_entry_rev',
'origin',
'origin_visit',
'person',
'release',
'revision',
'revision_history',
'skipped_content',
'snapshot'
);
$$;
create or replace function swh_update_counter(object_type text)
returns void
language plpgsql
as $$
begin
execute format('
insert into object_counts
(value, last_update, object_type)
values
((select count(*) from %1$I), NOW(), %1$L)
on conflict (object_type) do update set
value = excluded.value,
last_update = excluded.last_update',
object_type);
return;
end;
$$;
create or replace function swh_update_counter_bucketed()
returns void
language plpgsql
as $$
declare
query text;
line_to_update int;
new_value bigint;
begin
select
object_counts_bucketed.line,
format(
'select count(%I) from %I where %s',
coalesce(identifier, '*'),
object_type,
coalesce(
concat_ws(
' and ',
case when bucket_start is not null then
format('%I >= %L', identifier, bucket_start) -- lower bound condition, inclusive
end,
case when bucket_end is not null then
format('%I < %L', identifier, bucket_end) -- upper bound condition, exclusive
end
),
'true'
)
)
from object_counts_bucketed
order by coalesce(last_update, now() - '1 month'::interval) asc
limit 1
into line_to_update, query;
execute query into new_value;
update object_counts_bucketed
set value = new_value,
last_update = now()
where object_counts_bucketed.line = line_to_update;
END
$$;
create or replace function swh_update_counters_from_buckets()
returns trigger
language plpgsql
as $$
begin
with to_update as (
select object_type, sum(value) as value, max(last_update) as last_update
from object_counts_bucketed ob1
where not exists (
select 1 from object_counts_bucketed ob2
where ob1.object_type = ob2.object_type
and value is null
)
group by object_type
) update object_counts
set
value = to_update.value,
last_update = to_update.last_update
from to_update
where
object_counts.object_type = to_update.object_type
and object_counts.value != to_update.value;
return null;
end
$$;
create trigger update_counts_from_bucketed
after insert or update
on object_counts_bucketed
for each row
when (NEW.line % 256 = 0)
execute procedure swh_update_counters_from_buckets();
diff --git a/swh/storage/tests/conftest.py b/swh/storage/tests/conftest.py
index 94986c8..49b979e 100644
--- a/swh/storage/tests/conftest.py
+++ b/swh/storage/tests/conftest.py
@@ -1,267 +1,267 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import glob
import pytest
from typing import Union
from pytest_postgresql import factories
from pytest_postgresql.janitor import DatabaseJanitor, psycopg2, Version
from os import path, environ
from hypothesis import settings
from typing import Dict
import swh.storage
from swh.core.utils import numfile_sortkey as sortkey
from swh.model.tests.generate_testdata import gen_contents, gen_origins
SQL_DIR = path.join(path.dirname(swh.storage.__file__), 'sql')
environ['LC_ALL'] = 'C.UTF-8'
DUMP_FILES = path.join(SQL_DIR, '*.sql')
# define tests profile. Full documentation is at:
# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
settings.register_profile("fast", max_examples=5, deadline=5000)
settings.register_profile("slow", max_examples=20, deadline=5000)
@pytest.fixture
def swh_storage(postgresql_proc, swh_storage_postgresql):
storage_config = {
'cls': 'local',
'args': {
'db': 'postgresql://{user}@{host}:{port}/{dbname}'.format(
host=postgresql_proc.host,
port=postgresql_proc.port,
user='postgres',
dbname='tests'),
'objstorage': {
'cls': 'memory',
'args': {}
},
'journal_writer': {
'cls': 'memory',
},
},
}
storage = swh.storage.get_storage(**storage_config)
return storage
@pytest.fixture
def swh_contents(swh_storage):
contents = gen_contents(n=20)
swh_storage.content_add(contents)
return contents
@pytest.fixture
def swh_origins(swh_storage):
origins = gen_origins(n=100)
swh_storage.origin_add(origins)
return origins
# the postgres_fact factory fixture below is mostly a copy of the code
# from pytest-postgresql. We need a custom version here to be able to
# specify our version of the DBJanitor we use.
def postgresql_fact(process_fixture_name, db_name=None, dump_files=DUMP_FILES):
@pytest.fixture
def postgresql_factory(request):
"""
Fixture factory for PostgreSQL.
:param FixtureRequest request: fixture request object
:rtype: psycopg2.connection
:returns: postgresql client
"""
config = factories.get_config(request)
if not psycopg2:
raise ImportError(
'No module named psycopg2. Please install it.'
)
proc_fixture = request.getfixturevalue(process_fixture_name)
# _, config = try_import('psycopg2', request)
pg_host = proc_fixture.host
pg_port = proc_fixture.port
pg_user = proc_fixture.user
pg_options = proc_fixture.options
pg_db = db_name or config['dbname']
with SwhDatabaseJanitor(
pg_user, pg_host, pg_port, pg_db, proc_fixture.version,
dump_files=dump_files
):
connection = psycopg2.connect(
dbname=pg_db,
user=pg_user,
host=pg_host,
port=pg_port,
options=pg_options
)
yield connection
connection.close()
return postgresql_factory
swh_storage_postgresql = postgresql_fact('postgresql_proc')
# This version of the DatabaseJanitor implement a different setup/teardown
-# behavior than than the stock one: instead of droping, creating and
+# behavior than than the stock one: instead of dropping, creating and
# initializing the database for each test, it create and initialize the db only
# once, then it truncate the tables. This is needed to have acceptable test
# performances.
class SwhDatabaseJanitor(DatabaseJanitor):
def __init__(
self,
user: str,
host: str,
port: str,
db_name: str,
version: Union[str, float, Version],
dump_files: str = DUMP_FILES
) -> None:
super().__init__(user, host, port, db_name, version)
self.dump_files = sorted(
glob.glob(dump_files), key=sortkey)
def db_setup(self):
with psycopg2.connect(
dbname=self.db_name,
user=self.user,
host=self.host,
port=self.port,
) as cnx:
with cnx.cursor() as cur:
for fname in self.dump_files:
with open(fname) as fobj:
sql = fobj.read().replace('concurrently', '').strip()
if sql:
cur.execute(sql)
cnx.commit()
def db_reset(self):
with psycopg2.connect(
dbname=self.db_name,
user=self.user,
host=self.host,
port=self.port,
) as cnx:
with cnx.cursor() as cur:
cur.execute(
"SELECT table_name FROM information_schema.tables "
"WHERE table_schema = %s", ('public',))
tables = set(table for (table,) in cur.fetchall())
for table in tables:
cur.execute('truncate table %s cascade' % table)
cur.execute(
"SELECT sequence_name FROM information_schema.sequences "
"WHERE sequence_schema = %s", ('public',))
seqs = set(seq for (seq,) in cur.fetchall())
for seq in seqs:
cur.execute('ALTER SEQUENCE %s RESTART;' % seq)
cnx.commit()
def init(self):
with self.cursor() as cur:
cur.execute(
"SELECT COUNT(1) FROM pg_database WHERE datname=%s;",
(self.db_name,))
db_exists = cur.fetchone()[0] == 1
if db_exists:
cur.execute(
'UPDATE pg_database SET datallowconn=true '
'WHERE datname = %s;',
(self.db_name,))
if db_exists:
self.db_reset()
else:
with self.cursor() as cur:
cur.execute('CREATE DATABASE "{}";'.format(self.db_name))
self.db_setup()
def drop(self):
pid_column = 'pid'
with self.cursor() as cur:
cur.execute(
'UPDATE pg_database SET datallowconn=false '
'WHERE datname = %s;', (self.db_name,))
cur.execute(
'SELECT pg_terminate_backend(pg_stat_activity.{})'
'FROM pg_stat_activity '
'WHERE pg_stat_activity.datname = %s;'.format(pid_column),
(self.db_name,))
@pytest.fixture
def sample_data() -> Dict:
"""Pre-defined sample storage object data to manipulate
Returns:
Dict of data (keys: content, directory, revision, person)
"""
sample_content = {
'blake2s256': b'\xbf?\x05\xed\xc1U\xd2\xc5\x168Xm\x93\xde}f(HO@\xd0\xacn\x04\x1e\x9a\xb9\xfa\xbf\xcc\x08\xc7', # noqa
'sha1': b'g\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
'sha1_git': b'\xf2\xae\xfa\xba\xfa\xa6B\x9b^\xf9Z\xf5\x14\x0cna\xb0\xef\x8b', # noqa
'sha256': b"\x87\x022\xedZN\x84\xe8za\xf8'(oA\xc9k\xb1\x80c\x80\xe7J\x06\xea\xd2\xd5\xbeB\x19\xb8\xce", # noqa
'length': 48,
'data': b'temp file for testing content storage conversion',
'status': 'visible',
}
sample_content2 = {
'blake2s256': b'\xbf?\x05\xed\xc1U\xd2\xc5\x168Xm\x93\xde}f(HO@\xd0\xacn\x04\x1e\x9a\xb9\xfa\xbf\xcc\x08\xc7', # noqa
'sha1': b'f\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
'sha1_git': b'\xc2\xae\xfa\xba\xfa\xa6B\x9b^\xf9Z\xf5\x14\x0cna\xb0\xef\x8b', # noqa
'sha256': b"\x77\x022\xedZN\x84\xe8za\xf8'(oA\xc9k\xb1\x80c\x80\xe7J\x06\xea\xd2\xd5\xbeB\x19\xb8\xce", # noqa
'length': 50,
'data': b'temp file for testing content storage conversion 2',
'status': 'visible',
}
sample_directory = {
'id': b'f\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
'entries': []
}
sample_person = {
'name': b'John Doe',
'email': b'john.doe@institute.org',
'fullname': b'John Doe <john.doe@institute.org>'
}
sample_revision = {
'id': b'f\x15y+\xcb][\\\n\xf28\xb2\x0c_P[\xc8\x89Hk',
'message': b'something',
'author': sample_person,
'committer': sample_person,
'date': 1567591673,
'committer_date': 1567591673,
'type': 'tar',
'directory': b'\xc2\xae\xfa\xba\xfa\xa6B\x9b^\xf9Z\xf5\x14\x0cna\xb0\xef\x8b', # noqa
'synthetic': False,
'metadata': {},
'parents': [],
}
return {
'content': [sample_content, sample_content2],
'person': [sample_person],
'directory': [sample_directory],
'revision': [sample_revision],
}

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:25 PM (6 d, 19 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3352693

Event Timeline