Changeset View
Changeset View
Standalone View
Standalone View
sql/swh-func.sql
Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines | as $$ | ||||
) on commit drop; | ) on commit drop; | ||||
$$; | $$; | ||||
-- a content signature is a set of cryptographic checksums that we use to | -- a content signature is a set of cryptographic checksums that we use to | ||||
-- uniquely identify content, for the purpose of verifying if we already have | -- uniquely identify content, for the purpose of verifying if we already have | ||||
-- some content or not during content injection | -- some content or not during content injection | ||||
create type content_signature as ( | create type content_signature as ( | ||||
sha1 sha1, | sha1 sha1, | ||||
sha1_git sha1_git, | sha1_git sha1_git, | ||||
sha256 sha256 | sha256 sha256, | ||||
blake2s256 blake2s256 | |||||
); | ); | ||||
-- check which entries of tmp_content are missing from content | -- check which entries of tmp_content are missing from content | ||||
-- | -- | ||||
-- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content, | -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content, | ||||
-- 2. call this function | -- 2. call this function | ||||
create or replace function swh_content_missing() | create or replace function swh_content_missing() | ||||
returns setof content_signature | returns setof content_signature | ||||
language plpgsql | language plpgsql | ||||
as $$ | as $$ | ||||
begin | begin | ||||
return query ( | return query ( | ||||
select sha1, sha1_git, sha256 from tmp_content as tmp | select sha1, sha1_git, sha256, blake2s256 from tmp_content as tmp | ||||
where not exists ( | where not exists ( | ||||
select 1 from content as c | select 1 from content as c | ||||
where c.sha1 = tmp.sha1 and c.sha1_git = tmp.sha1_git and c.sha256 = tmp.sha256 | where c.sha1 = tmp.sha1 and | ||||
olasd: This is a "primary key" check. It must not change until all the columns are `not null`, else… | |||||
Not Done Inline ActionsYes indeed! (In a future 105 version for example). ardumont: Yes indeed!
This can be installed once the migration is complete (through the rehash routine is… | |||||
c.sha1_git = tmp.sha1_git and | |||||
c.sha256 = tmp.sha256 | |||||
) | ) | ||||
); | ); | ||||
return; | return; | ||||
end | end | ||||
$$; | $$; | ||||
-- check which entries of tmp_content_sha1 are missing from content | -- check which entries of tmp_content_sha1 are missing from content | ||||
-- | -- | ||||
Show All 18 Lines | |||||
-- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content, | -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content, | ||||
-- 2. call this function | -- 2. call this function | ||||
create or replace function swh_skipped_content_missing() | create or replace function swh_skipped_content_missing() | ||||
returns setof content_signature | returns setof content_signature | ||||
language plpgsql | language plpgsql | ||||
as $$ | as $$ | ||||
begin | begin | ||||
return query | return query | ||||
select sha1, sha1_git, sha256 from tmp_skipped_content t | select sha1, sha1_git, sha256, blake2s256 from tmp_skipped_content t | ||||
where not exists | where not exists | ||||
(select 1 from skipped_content s where | (select 1 from skipped_content s where | ||||
s.sha1 is not distinct from t.sha1 and | s.sha1 is not distinct from t.sha1 and | ||||
s.sha1_git is not distinct from t.sha1_git and | s.sha1_git is not distinct from t.sha1_git and | ||||
s.sha256 is not distinct from t.sha256); | s.sha256 is not distinct from t.sha256); | ||||
return; | return; | ||||
end | end | ||||
$$; | $$; | ||||
-- Look up content based on one or several different checksums. Return all | -- Look up content based on one or several different checksums. Return all | ||||
-- content information if the content is found; a NULL row otherwise. | -- content information if the content is found; a NULL row otherwise. | ||||
-- | -- | ||||
-- At least one checksum should be not NULL. If several are not NULL, they will | -- At least one checksum should be not NULL. If several are not NULL, they will | ||||
-- be AND-ed together in the lookup query. | -- be AND-ed together in the lookup query. | ||||
-- | -- | ||||
-- Note: this function is meant to be used to look up individual contents | -- Note: this function is meant to be used to look up individual contents | ||||
-- (e.g., for the web app), for batch lookup of missing content (e.g., to be | -- (e.g., for the web app), for batch lookup of missing content (e.g., to be | ||||
-- added) see swh_content_missing | -- added) see swh_content_missing | ||||
create or replace function swh_content_find( | create or replace function swh_content_find( | ||||
sha1 sha1 default NULL, | sha1 sha1 default NULL, | ||||
sha1_git sha1_git default NULL, | sha1_git sha1_git default NULL, | ||||
sha256 sha256 default NULL | sha256 sha256 default NULL, | ||||
blake2s256 blake2s256 default NULL | |||||
) | ) | ||||
returns content | returns content | ||||
language plpgsql | language plpgsql | ||||
as $$ | as $$ | ||||
declare | declare | ||||
con content; | con content; | ||||
filters text[] := array[] :: text[]; -- AND-clauses used to filter content | filters text[] := array[] :: text[]; -- AND-clauses used to filter content | ||||
q text; | q text; | ||||
begin | begin | ||||
if sha1 is not null then | if sha1 is not null then | ||||
filters := filters || format('sha1 = %L', sha1); | filters := filters || format('sha1 = %L', sha1); | ||||
end if; | end if; | ||||
if sha1_git is not null then | if sha1_git is not null then | ||||
filters := filters || format('sha1_git = %L', sha1_git); | filters := filters || format('sha1_git = %L', sha1_git); | ||||
end if; | end if; | ||||
if sha256 is not null then | if sha256 is not null then | ||||
filters := filters || format('sha256 = %L', sha256); | filters := filters || format('sha256 = %L', sha256); | ||||
end if; | end if; | ||||
if blake2s256 is not null then | |||||
filters := filters || format('blake2s256 = %L', blake2s256); | |||||
end if; | |||||
if cardinality(filters) = 0 then | if cardinality(filters) = 0 then | ||||
return null; | return null; | ||||
else | else | ||||
q = format('select * from content where %s', | q = format('select * from content where %s', | ||||
array_to_string(filters, ' and ')); | array_to_string(filters, ' and ')); | ||||
execute q into con; | execute q into con; | ||||
return con; | return con; | ||||
end if; | end if; | ||||
end | end | ||||
$$; | $$; | ||||
-- add tmp_content entries to content, skipping duplicates | -- add tmp_content entries to content, skipping duplicates | ||||
-- | -- | ||||
-- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content, | -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content, | ||||
-- 2. call this function | -- 2. call this function | ||||
create or replace function swh_content_add() | create or replace function swh_content_add() | ||||
returns void | returns void | ||||
language plpgsql | language plpgsql | ||||
as $$ | as $$ | ||||
begin | begin | ||||
insert into content (sha1, sha1_git, sha256, length, status) | insert into content (sha1, sha1_git, sha256, blake2s256, length, status) | ||||
select distinct sha1, sha1_git, sha256, length, status | select distinct sha1, sha1_git, sha256, blake2s256, length, status | ||||
from tmp_content | from tmp_content | ||||
where (sha1, sha1_git, sha256) in | where (sha1, sha1_git, sha256, blake2s256) in | ||||
(select * from swh_content_missing()); | (select * from swh_content_missing()); | ||||
-- TODO XXX use postgres 9.5 "UPSERT" support here, when available. | -- TODO XXX use postgres 9.5 "UPSERT" support here, when available. | ||||
-- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid | -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid | ||||
-- the extra swh_content_missing() query here. | -- the extra swh_content_missing() query here. | ||||
return; | return; | ||||
end | end | ||||
Not Done Inline ActionsUnfortunately, for now, this check is not equivalent to the previous one. olasd: Unfortunately, for now, this check is not equivalent to the previous one. | |||||
Not Done Inline ActionsYes, this goes in 105 version as well. ardumont: Yes, this goes in 105 version as well. | |||||
$$; | $$; | ||||
-- add tmp_skipped_content entries to skipped_content, skipping duplicates | -- add tmp_skipped_content entries to skipped_content, skipping duplicates | ||||
-- | -- | ||||
-- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content, | -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content, | ||||
-- 2. call this function | -- 2. call this function | ||||
create or replace function swh_skipped_content_add() | create or replace function swh_skipped_content_add() | ||||
returns void | returns void | ||||
language plpgsql | language plpgsql | ||||
as $$ | as $$ | ||||
begin | begin | ||||
insert into skipped_content (sha1, sha1_git, sha256, length, status, reason, origin) | insert into skipped_content (sha1, sha1_git, sha256, blake2s256, length, status, reason, origin) | ||||
select distinct sha1, sha1_git, sha256, length, status, reason, origin | select distinct sha1, sha1_git, sha256, blake2s256, length, status, reason, origin | ||||
from tmp_skipped_content | from tmp_skipped_content | ||||
where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in | where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, ''), coalesce(blake2s256, '')) in | ||||
(select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '') from swh_skipped_content_missing()); | (select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, ''), coalesce(blake2s256, '') from swh_skipped_content_missing()); | ||||
Done Inline Actionsbug coalesce(blake2s256, '') ardumont: bug `coalesce(blake2s256, '')` | |||||
-- TODO XXX use postgres 9.5 "UPSERT" support here, when available. | -- TODO XXX use postgres 9.5 "UPSERT" support here, when available. | ||||
-- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid | -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid | ||||
-- the extra swh_content_missing() query here. | -- the extra swh_content_missing() query here. | ||||
Done Inline ActionsAnd neither is this one. olasd: And neither is this one. | |||||
Done Inline ActionsYes, this goes in 105 version as well. ardumont: Yes, this goes in 105 version as well. | |||||
return; | return; | ||||
end | end | ||||
$$; | $$; | ||||
-- Update content entries from temporary table. | -- Update content entries from temporary table. | ||||
-- (columns are potential new columns added to the schema, this cannot be empty) | -- (columns are potential new columns added to the schema, this cannot be empty) | ||||
-- | -- | ||||
create or replace function swh_content_update(columns_update text[]) | create or replace function swh_content_update(columns_update text[]) | ||||
returns void | returns void | ||||
language plpgsql | language plpgsql | ||||
as $$ | as $$ | ||||
declare | declare | ||||
▲ Show 20 Lines • Show All 1,756 Lines • Show Last 20 Lines |
This is a "primary key" check. It must not change until all the columns are not null, else the equality will fail and we'll get duplicate entries: the old entries with only the old checksums will not match newer entries with the new checksum.