diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -62,7 +62,13 @@ ( id bigserial not null, type text, -- TODO use an enum here (?) - url text not null + url text not null, + tsv tsvector +); + +-- Search helper +create materialized view origin_word_stats as ( + select * from ts_stat('select tsv from origin') ); diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql --- a/swh/storage/sql/40-swh-func.sql +++ b/swh/storage/sql/40-swh-func.sql @@ -797,6 +797,53 @@ where origin_visit.origin=origin_id and origin_visit.visit=visit_id; $$; + +-- Visit search related functions + +create or replace function swh_origin_to_tsv(origin origin) + returns tsvector + language plpgsql + stable +as $$ + begin + return + setweight(to_tsvector('simple', origin.type), 'D') + || setweight( + ts_delete( + to_tsvector( + 'simple', + regexp_replace(lower(origin.url), '[^a-z0-9]', ' ', 'g') + ), + -- FIXME: remove top words instead of hardcoded list + array['https', 'github', 'com'] + ), + 'C' + ) + || setweight(to_tsvector('simple', lower(origin.url)), 'A'); + end; +$$; + +create or replace function swh_origin_search_prefix(query text, count int default 100) + returns setof origin + language sql + stable +as $$ + select * from origin + where lower(url) ~~* lower(query) || '%' escape '\' + limit count; +$$; + +create or replace function swh_origin_search_keyword_exact(query tsquery, count int default 100) + returns setof origin + language sql + stable +as $$ + select * from origin + where tsv @@ query + order by ts_rank_cd(tsv, query) desc + limit count; +$$; + -- Absolute path: directory reference + complete path relative to it create type content_dir as ( directory sha1_git, @@ -1055,6 +1102,7 @@ end; $$; + create or replace function swh_update_counter_bucketed() returns void language plpgsql diff --git a/swh/storage/sql/60-swh-indexes.sql b/swh/storage/sql/60-swh-indexes.sql --- a/swh/storage/sql/60-swh-indexes.sql +++ b/swh/storage/sql/60-swh-indexes.sql @@ -14,7 +14,9 @@ create unique index concurrently origin_pkey on origin(id); alter table origin add primary key using index origin_pkey; -create index concurrently on origin(type, url); +create unique index concurrently on origin(type, url); +create index concurrently on origin using btree(lower(url)); +create index concurrently on origin using gin(tsv); -- skipped_content diff --git a/swh/storage/sql/70-swh-triggers.sql b/swh/storage/sql/70-swh-triggers.sql --- a/swh/storage/sql/70-swh-triggers.sql +++ b/swh/storage/sql/70-swh-triggers.sql @@ -1,3 +1,27 @@ +create function origin_update_tsv() + returns trigger + language plpgsql +as $$ + begin + new.tsv := swh_origin_to_tsv(new); + return new; + end +$$; + +create trigger origin_update_tsv + before update + on origin + for each row + when (old.url is distinct from new.url or old.type is distinct from new.type) + execute procedure origin_update_tsv(); + +create trigger origin_insert_tsv + before insert + on origin + for each row + execute procedure origin_update_tsv(); + + -- Asynchronous notification of new content insertions create function notify_new_content() returns trigger