diff --git a/sql/swh-func.sql b/sql/swh-func.sql
index 76a85a72e..43fb9fde1 100644
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1,1890 +1,1896 @@
 -- create a temporary table called tmp_TBLNAME, mimicking existing table
 -- TBLNAME
 --
 -- Args:
 --     tblname: name of the table to mimick
 create or replace function swh_mktemp(tblname regclass)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
 	create temporary table tmp_%1$I
 	    (like %1$I including defaults)
 	    on commit drop;
       alter table tmp_%1$I drop column if exists object_id;
 	', tblname);
     return;
 end
 $$;
 
 -- create a temporary table for directory entries called tmp_TBLNAME,
 -- mimicking existing table TBLNAME with an extra dir_id (sha1_git)
 -- column, and dropping the id column.
 --
 -- This is used to create the tmp_directory_entry_<foo> tables.
 --
 -- Args:
 --     tblname: name of the table to mimick
 create or replace function swh_mktemp_dir_entry(tblname regclass)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
 	create temporary table tmp_%1$I
 	    (like %1$I including defaults, dir_id sha1_git)
 	    on commit drop;
         alter table tmp_%1$I drop column id;
 	', tblname);
     return;
 end
 $$;
 
 
 -- create a temporary table for revisions called tmp_revisions,
 -- mimicking existing table revision, replacing the foreign keys to
 -- people with an email and name field
 --
 create or replace function swh_mktemp_revision()
     returns void
     language sql
 as $$
     create temporary table tmp_revision (
         like revision including defaults,
         author_fullname bytea,
         author_name bytea,
         author_email bytea,
         committer_fullname bytea,
         committer_name bytea,
         committer_email bytea
     ) on commit drop;
     alter table tmp_revision drop column author;
     alter table tmp_revision drop column committer;
     alter table tmp_revision drop column object_id;
 $$;
 
 
 -- create a temporary table for releases called tmp_release,
 -- mimicking existing table release, replacing the foreign keys to
 -- people with an email and name field
 --
 create or replace function swh_mktemp_release()
     returns void
     language sql
 as $$
     create temporary table tmp_release (
         like release including defaults,
         author_fullname bytea,
         author_name bytea,
         author_email bytea
     ) on commit drop;
     alter table tmp_release drop column author;
     alter table tmp_release drop column object_id;
 $$;
 
 -- create a temporary table with a single "bytea" column for fast object lookup.
 create or replace function swh_mktemp_bytea()
     returns void
     language sql
 as $$
     create temporary table tmp_bytea (
       id bytea
     ) on commit drop;
 $$;
 
 -- create a temporary table for occurrence_history
 create or replace function swh_mktemp_occurrence_history()
     returns void
     language sql
 as $$
     create temporary table tmp_occurrence_history(
         like occurrence_history including defaults,
         visit bigint not null
     ) on commit drop;
     alter table tmp_occurrence_history
       drop column visits,
       drop column object_id;
 $$;
 
 -- create a temporary table for entity_history, sans id
 create or replace function swh_mktemp_entity_history()
     returns void
     language sql
 as $$
     create temporary table tmp_entity_history (
         like entity_history including defaults) on commit drop;
     alter table tmp_entity_history drop column id;
 $$;
 
 -- create a temporary table for entities called tmp_entity_lister,
 -- with only the columns necessary for retrieving the uuid of a listed
 -- entity.
 create or replace function swh_mktemp_entity_lister()
     returns void
     language sql
 as $$
   create temporary table tmp_entity_lister (
     id              bigint,
     lister_metadata jsonb
   ) on commit drop;
 $$;
 
 -- create a temporary table for content_fossology_license tmp_content_fossology_license,
 create or replace function swh_mktemp_content_fossology_license()
     returns void
     language sql
 as $$
   create temporary table tmp_content_fossology_license (
     id           sha1,
     tool_name    text,
     tool_version text,
     license      text
   ) on commit drop;
 $$;
 
 comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license';
 
 -- create a temporary table for checking licenses' name
 create or replace function swh_mktemp_content_fossology_license_unknown()
     returns void
     language sql
 as $$
   create temporary table tmp_content_fossology_license_unknown (
     name       text not null
   ) on commit drop;
 $$;
 
 comment on function swh_mktemp_content_fossology_license_unknown() is 'Helper table to list unknown licenses';
 
 
 -- a content signature is a set of cryptographic checksums that we use to
 -- uniquely identify content, for the purpose of verifying if we already have
 -- some content or not during content injection
 create type content_signature as (
     sha1      sha1,
     sha1_git  sha1_git,
     sha256    sha256
 );
 
 
 -- check which entries of tmp_content are missing from content
 --
 -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
 -- 2. call this function
 create or replace function swh_content_missing()
     returns setof content_signature
     language plpgsql
 as $$
 begin
     -- This query is critical for (single-algorithm) hash collision detection,
     -- so we cannot rely only on the fact that a single hash (e.g., sha1) is
     -- missing from the table content to conclude that a given content is
     -- missing. Ideally, we would want to (try to) add to content all entries
     -- in tmp_content that, when considering all columns together, are missing
     -- from content.
     --
     -- But doing that naively would require a *compound* index on all checksum
     -- columns; that index would not be significantly smaller than the content
     -- table itself, and therefore won't be used. Therefore we union together
     -- all contents that differ on at least one column from what is already
     -- available. If there is a collision on some (but not all) columns, the
     -- relevant tmp_content entry will be included in the set of content to be
     -- added, causing a downstream violation of unicity constraint.
     return query
 	(select sha1, sha1_git, sha256 from tmp_content as tmp
 	 where not exists
 	     (select 1 from content as c where c.sha1 = tmp.sha1))
 	union
 	(select sha1, sha1_git, sha256 from tmp_content as tmp
 	 where not exists
 	     (select 1 from content as c where c.sha1_git = tmp.sha1_git))
 	union
 	(select sha1, sha1_git, sha256 from tmp_content as tmp
 	 where not exists
 	     (select 1 from content as c where c.sha256 = tmp.sha256));
     return;
 end
 $$;
 
 -- check which entries of tmp_content_sha1 are missing from content
 --
 -- operates in bulk: 0. swh_mktemp_content_sha1(), 1. COPY to tmp_content_sha1,
 -- 2. call this function
 create or replace function swh_content_missing_per_sha1()
     returns setof sha1
     language plpgsql
 as $$
 begin
     return query
            (select id::sha1
             from tmp_bytea as tmp
             where not exists
             (select 1 from content as c where c.sha1=tmp.id));
 end
 $$;
 
 
 -- check which entries of tmp_skipped_content are missing from skipped_content
 --
 -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
 -- 2. call this function
 create or replace function swh_skipped_content_missing()
     returns setof content_signature
     language plpgsql
 as $$
 begin
     return query
 	select sha1, sha1_git, sha256 from tmp_skipped_content t
 	where not exists
 	(select 1 from skipped_content s where
 	    s.sha1 is not distinct from t.sha1 and
 	    s.sha1_git is not distinct from t.sha1_git and
 	    s.sha256 is not distinct from t.sha256);
     return;
 end
 $$;
 
 
 -- Look up content based on one or several different checksums. Return all
 -- content information if the content is found; a NULL row otherwise.
 --
 -- At least one checksum should be not NULL. If several are not NULL, they will
 -- be AND-ed together in the lookup query.
 --
 -- Note: this function is meant to be used to look up individual contents
 -- (e.g., for the web app), for batch lookup of missing content (e.g., to be
 -- added) see swh_content_missing
 create or replace function swh_content_find(
     sha1      sha1     default NULL,
     sha1_git  sha1_git default NULL,
     sha256    sha256   default NULL
 )
     returns content
     language plpgsql
 as $$
 declare
     con content;
     filters text[] := array[] :: text[];  -- AND-clauses used to filter content
     q text;
 begin
     if sha1 is not null then
         filters := filters || format('sha1 = %L', sha1);
     end if;
     if sha1_git is not null then
         filters := filters || format('sha1_git = %L', sha1_git);
     end if;
     if sha256 is not null then
         filters := filters || format('sha256 = %L', sha256);
     end if;
 
     if cardinality(filters) = 0 then
         return null;
     else
         q = format('select * from content where %s',
 	        array_to_string(filters, ' and '));
         execute q into con;
 	return con;
     end if;
 end
 $$;
 
 
 -- add tmp_content entries to content, skipping duplicates
 --
 -- operates in bulk: 0. swh_mktemp(content), 1. COPY to tmp_content,
 -- 2. call this function
 create or replace function swh_content_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into content (sha1, sha1_git, sha256, length, status)
 	select distinct sha1, sha1_git, sha256, length, status
 	from tmp_content
 	where (sha1, sha1_git, sha256) in
 	    (select * from swh_content_missing());
 	    -- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
 	    -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
 	    -- the extra swh_content_missing() query here.
     return;
 end
 $$;
 
 
 -- add tmp_skipped_content entries to skipped_content, skipping duplicates
 --
 -- operates in bulk: 0. swh_mktemp(skipped_content), 1. COPY to tmp_skipped_content,
 -- 2. call this function
 create or replace function swh_skipped_content_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into skipped_content (sha1, sha1_git, sha256, length, status, reason, origin)
 	select distinct sha1, sha1_git, sha256, length, status, reason, origin
 	from tmp_skipped_content
 	where (coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '')) in
 	    (select coalesce(sha1, ''), coalesce(sha1_git, ''), coalesce(sha256, '') from swh_skipped_content_missing());
 	    -- TODO XXX use postgres 9.5 "UPSERT" support here, when available.
 	    -- Specifically, using "INSERT .. ON CONFLICT IGNORE" we can avoid
 	    -- the extra swh_content_missing() query here.
     return;
 end
 $$;
 
 
 -- check which entries of tmp_directory are missing from directory
 --
 -- operates in bulk: 0. swh_mktemp(directory), 1. COPY to tmp_directory,
 -- 2. call this function
 create or replace function swh_directory_missing()
     returns setof sha1_git
     language plpgsql
 as $$
 begin
     return query
 	select id from tmp_directory t
 	where not exists (
 	    select 1 from directory d
 	    where d.id = t.id);
     return;
 end
 $$;
 
 
 -- Retrieve information on directory from temporary table
 create or replace function swh_directory_get()
     returns setof directory
     language plpgsql
 as $$
 begin
     return query
 	select d.*
         from tmp_directory t
         inner join directory d on t.id = d.id;
     return;
 end
 $$;
 
 
 create type directory_entry_type as enum('file', 'dir', 'rev');
 
 
 -- Add tmp_directory_entry_* entries to directory_entry_* and directory,
 -- skipping duplicates in directory_entry_*.  This is a generic function that
 -- works on all kind of directory entries.
 --
 -- operates in bulk: 0. swh_mktemp_dir_entry('directory_entry_*'), 1 COPY to
 -- tmp_directory_entry_*, 2. call this function
 --
 -- Assumption: this function is used in the same transaction that inserts the
 -- context directory in table "directory".
 create or replace function swh_directory_entry_add(typ directory_entry_type)
     returns void
     language plpgsql
 as $$
 begin
     execute format('
     insert into directory_entry_%1$s (target, name, perms)
     select distinct t.target, t.name, t.perms
     from tmp_directory_entry_%1$s t
     where not exists (
     select 1
     from directory_entry_%1$s i
     where t.target = i.target and t.name = i.name and t.perms = i.perms)
    ', typ);
 
     execute format('
     with new_entries as (
 	select t.dir_id, array_agg(i.id) as entries
 	from tmp_directory_entry_%1$s t
 	inner join directory_entry_%1$s i
 	using (target, name, perms)
 	group by t.dir_id
     )
     update tmp_directory as d
     set %1$s_entries = new_entries.entries
     from new_entries
     where d.id = new_entries.dir_id
     ', typ);
 
     return;
 end
 $$;
 
 -- Insert the data from tmp_directory, tmp_directory_entry_file,
 -- tmp_directory_entry_dir, tmp_directory_entry_rev into their final
 -- tables.
 --
 -- Prerequisites:
 --  directory ids in tmp_directory
 --  entries in tmp_directory_entry_{file,dir,rev}
 --
 create or replace function swh_directory_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_directory_entry_add('file');
     perform swh_directory_entry_add('dir');
     perform swh_directory_entry_add('rev');
 
     insert into directory
     select * from tmp_directory t
     where not exists (
         select 1 from directory d
 	where d.id = t.id);
 
     return;
 end
 $$;
 
 -- a directory listing entry with all the metadata
 --
 -- can be used to list a directory, and retrieve all the data in one go.
 create type directory_entry as
 (
   dir_id   sha1_git,     -- id of the parent directory
   type     directory_entry_type,  -- type of entry
   target   sha1_git,     -- id of target
   name     unix_path,    -- path name, relative to containing dir
   perms    file_perms,   -- unix-like permissions
   status   content_status,  -- visible or absent
   sha1     sha1,            -- content if sha1 if type is not dir
   sha1_git sha1_git,        -- content's sha1 git if type is not dir
   sha256   sha256           -- content's sha256 if type is not dir
 );
 
 
 -- List a single level of directory walked_dir_id
 -- FIXME: order by name is not correct. For git, we need to order by
 -- lexicographic order but as if a trailing / is present in directory
 -- name
 create or replace function swh_directory_walk_one(walked_dir_id sha1_git)
     returns setof directory_entry
     language sql
     stable
 as $$
     with dir as (
 	select id as dir_id, dir_entries, file_entries, rev_entries
 	from directory
 	where id = walked_dir_id),
     ls_d as (select dir_id, unnest(dir_entries) as entry_id from dir),
     ls_f as (select dir_id, unnest(file_entries) as entry_id from dir),
     ls_r as (select dir_id, unnest(rev_entries) as entry_id from dir)
     (select dir_id, 'dir'::directory_entry_type as type,
             e.target, e.name, e.perms, NULL::content_status,
             NULL::sha1, NULL::sha1_git, NULL::sha256
      from ls_d
      left join directory_entry_dir e on ls_d.entry_id = e.id)
     union
     (select dir_id, 'file'::directory_entry_type as type,
             e.target, e.name, e.perms, c.status,
             c.sha1, c.sha1_git, c.sha256
      from ls_f
      left join directory_entry_file e on ls_f.entry_id = e.id
      left join content c on e.target = c.sha1_git)
     union
     (select dir_id, 'rev'::directory_entry_type as type,
             e.target, e.name, e.perms, NULL::content_status,
             NULL::sha1, NULL::sha1_git, NULL::sha256
      from ls_r
      left join directory_entry_rev e on ls_r.entry_id = e.id)
     order by name;
 $$;
 
 -- List recursively the revision directory arborescence
 create or replace function swh_directory_walk(walked_dir_id sha1_git)
     returns setof directory_entry
     language sql
     stable
 as $$
     with recursive entries as (
         select dir_id, type, target, name, perms, status, sha1, sha1_git,
                sha256
         from swh_directory_walk_one(walked_dir_id)
         union all
         select dir_id, type, target, (dirname || '/' || name)::unix_path as name,
                perms, status, sha1, sha1_git, sha256
         from (select (swh_directory_walk_one(dirs.target)).*, dirs.name as dirname
               from (select target, name from entries where type = 'dir') as dirs) as with_parent
     )
     select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256
     from entries
 $$;
 
 create or replace function swh_revision_walk(revision_id sha1_git)
   returns setof directory_entry
   language sql
   stable
 as $$
   select dir_id, type, target, name, perms, status, sha1, sha1_git, sha256
   from swh_directory_walk((select directory from revision where id=revision_id))
 $$;
 
 COMMENT ON FUNCTION swh_revision_walk(sha1_git) IS 'Recursively list the revision targeted directory arborescence';
 
 
 -- Find a directory entry by its path
 create or replace function swh_find_directory_entry_by_path(
     walked_dir_id sha1_git,
     dir_or_content_path bytea[])
     returns directory_entry
     language plpgsql
 as $$
 declare
     end_index integer;
     paths bytea default '';
     path bytea;
     res bytea[];
     r record;
 begin
     end_index := array_upper(dir_or_content_path, 1);
     res[1] := walked_dir_id;
 
     for i in 1..end_index
     loop
         path := dir_or_content_path[i];
         -- concatenate path for patching the name in the result record (if we found it)
         if i = 1 then
             paths = path;
         else
             paths := paths || '/' || path;  -- concatenate paths
         end if;
 
         if i <> end_index then
             select *
             from swh_directory_walk_one(res[i] :: sha1_git)
             where name=path
             and type = 'dir'
             limit 1 into r;
         else
             select *
             from swh_directory_walk_one(res[i] :: sha1_git)
             where name=path
             limit 1 into r;
         end if;
 
         -- find the path
         if r is null then
            return null;
         else
             -- store the next dir to lookup the next local path from
             res[i+1] := r.target;
         end if;
     end loop;
 
     -- at this moment, r is the result. Patch its 'name' with the full path before returning it.
     r.name := paths;
     return r;
 end
 $$;
 
 -- List all revision IDs starting from a given revision, going back in time
 --
 -- TODO ordering: should be breadth-first right now (what do we want?)
 -- TODO ordering: ORDER BY parent_rank somewhere?
 create or replace function swh_revision_list(root_revisions bytea[], num_revs bigint default NULL)
     returns table (id sha1_git, parents bytea[])
     language sql
     stable
 as $$
     with recursive full_rev_list(id) as (
         (select id from revision where id = ANY(root_revisions))
         union
         (select h.parent_id
          from revision_history as h
          join full_rev_list on h.id = full_rev_list.id)
     ),
     rev_list as (select id from full_rev_list limit num_revs)
     select rev_list.id as id,
            array(select rh.parent_id::bytea
                  from revision_history rh
                  where rh.id = rev_list.id
                  order by rh.parent_rank
                 ) as parent
     from rev_list;
 $$;
 
 -- List all the children of a given revision
 create or replace function swh_revision_list_children(root_revisions bytea[], num_revs bigint default NULL)
     returns table (id sha1_git, parents bytea[])
     language sql
     stable
 as $$
     with recursive full_rev_list(id) as (
         (select id from revision where id = ANY(root_revisions))
         union
         (select h.id
          from revision_history as h
          join full_rev_list on h.parent_id = full_rev_list.id)
     ),
     rev_list as (select id from full_rev_list limit num_revs)
     select rev_list.id as id,
            array(select rh.parent_id::bytea
                  from revision_history rh
                  where rh.id = rev_list.id
                  order by rh.parent_rank
                 ) as parent
     from rev_list;
 $$;
 
 
 -- Detailed entry for a revision
 create type revision_entry as
 (
   id                             sha1_git,
   date                           timestamptz,
   date_offset                    smallint,
   date_neg_utc_offset            boolean,
   committer_date                 timestamptz,
   committer_date_offset          smallint,
   committer_date_neg_utc_offset  boolean,
   type                           revision_type,
   directory                      sha1_git,
   message                        bytea,
   author_id                      bigint,
   author_fullname                bytea,
   author_name                    bytea,
   author_email                   bytea,
   committer_id                   bigint,
   committer_fullname             bytea,
   committer_name                 bytea,
   committer_email                bytea,
   metadata                       jsonb,
   synthetic                      boolean,
   parents                        bytea[],
   object_id                      bigint
 );
 
 
 -- "git style" revision log. Similar to swh_revision_list(), but returning all
 -- information associated to each revision, and expanding authors/committers
 create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
     returns setof revision_entry
     language sql
     stable
 as $$
     select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
            r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
            r.type, r.directory, r.message,
            a.id, a.fullname, a.name, a.email,
            c.id, c.fullname, c.name, c.email,
            r.metadata, r.synthetic, t.parents, r.object_id
     from swh_revision_list(root_revisions, num_revs) as t
     left join revision r on t.id = r.id
     left join person a on a.id = r.author
     left join person c on c.id = r.committer;
 $$;
 
 
 -- Retrieve revisions from tmp_bytea in bulk
 create or replace function swh_revision_get()
     returns setof revision_entry
     language plpgsql
 as $$
 begin
     return query
         select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
                r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
                r.type, r.directory, r.message,
                a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
          array(select rh.parent_id::bytea from revision_history rh where rh.id = t.id order by rh.parent_rank)
                    as parents, r.object_id
         from tmp_bytea t
         left join revision r on t.id = r.id
         left join person a on a.id = r.author
         left join person c on c.id = r.committer;
     return;
 end
 $$;
 
 -- List missing revisions from tmp_bytea
 create or replace function swh_revision_missing()
     returns setof sha1_git
     language plpgsql
 as $$
 begin
     return query
         select id::sha1_git from tmp_bytea t
 	where not exists (
 	    select 1 from revision r
 	    where r.id = t.id);
     return;
 end
 $$;
 
 -- Detailed entry for a release
 create type release_entry as
 (
   id                   sha1_git,
   target               sha1_git,
   target_type          object_type,
   date                 timestamptz,
   date_offset          smallint,
   date_neg_utc_offset  boolean,
   name                 bytea,
   comment              bytea,
   synthetic            boolean,
   author_id            bigint,
   author_fullname      bytea,
   author_name          bytea,
   author_email         bytea,
   object_id            bigint
 );
 
 -- Detailed entry for release
 create or replace function swh_release_get()
     returns setof release_entry
     language plpgsql
 as $$
 begin
     return query
         select r.id, r.target, r.target_type, r.date, r.date_offset, r.date_neg_utc_offset, r.name, r.comment,
                r.synthetic, p.id as author_id, p.fullname as author_fullname, p.name as author_name, p.email as author_email, r.object_id
         from tmp_bytea t
         inner join release r on t.id = r.id
         inner join person p on p.id = r.author;
     return;
 end
 $$;
 
 -- Create entries in person from tmp_revision
 create or replace function swh_person_add_from_revision()
     returns void
     language plpgsql
 as $$
 begin
     with t as (
         select author_fullname as fullname, author_name as name, author_email as email from tmp_revision
     union
         select committer_fullname as fullname, committer_name as name, committer_email as email from tmp_revision
     ) insert into person (fullname, name, email)
     select distinct fullname, name, email from t
     where not exists (
         select 1
         from person p
         where t.fullname = p.fullname
     );
     return;
 end
 $$;
 
 
 -- Create entries in revision from tmp_revision
 create or replace function swh_revision_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_revision();
 
     insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic)
     select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic
     from tmp_revision t
     left join person a on a.fullname = t.author_fullname
     left join person c on c.fullname = t.committer_fullname;
     return;
 end
 $$;
 
 
 -- List missing releases from tmp_bytea
 create or replace function swh_release_missing()
     returns setof sha1_git
     language plpgsql
 as $$
 begin
   return query
     select id::sha1_git from tmp_bytea t
     where not exists (
       select 1 from release r
       where r.id = t.id);
 end
 $$;
 
 
 -- Create entries in person from tmp_release
 create or replace function swh_person_add_from_release()
     returns void
     language plpgsql
 as $$
 begin
     with t as (
         select distinct author_fullname as fullname, author_name as name, author_email as email from tmp_release
     ) insert into person (fullname, name, email)
     select fullname, name, email from t
     where not exists (
         select 1
         from person p
         where t.fullname = p.fullname
     );
     return;
 end
 $$;
 
 
 -- Create entries in release from tmp_release
 create or replace function swh_release_add()
     returns void
     language plpgsql
 as $$
 begin
     perform swh_person_add_from_release();
 
     insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
     select t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
     from tmp_release t
     left join person a on a.fullname = t.author_fullname;
     return;
 end
 $$;
 
 create or replace function swh_occurrence_update_for_origin(origin_id bigint)
   returns void
   language sql
 as $$
   delete from occurrence where origin = origin_id;
   insert into occurrence (origin, branch, target, target_type)
     select origin, branch, target, target_type
     from occurrence_history
     where origin = origin_id and
           (select visit from origin_visit
            where origin = origin_id
            order by date desc
            limit 1) = any(visits);
 $$;
 
 create or replace function swh_occurrence_update_all()
   returns void
   language plpgsql
 as $$
 declare
   origin_id origin.id%type;
 begin
   for origin_id in
     select distinct id from origin
   loop
     perform swh_occurrence_update_for_origin(origin_id);
   end loop;
   return;
 end;
 $$;
 
 -- add a new origin_visit for origin origin_id at date.
 --
 -- Returns the new visit id.
 create or replace function swh_origin_visit_add(origin_id bigint, date timestamptz)
     returns bigint
     language sql
 as $$
   with last_known_visit as (
     select coalesce(max(visit), 0) as visit
     from origin_visit
     where origin = origin_id
   )
   insert into origin_visit (origin, date, visit, status)
   values (origin_id, date, (select visit from last_known_visit) + 1, 'ongoing')
   returning visit;
 $$;
 
 -- add tmp_occurrence_history entries to occurrence_history
 --
 -- operates in bulk: 0. swh_mktemp(occurrence_history), 1. COPY to tmp_occurrence_history,
 -- 2. call this function
 create or replace function swh_occurrence_history_add()
     returns void
     language plpgsql
 as $$
 declare
   origin_id origin.id%type;
 begin
   -- Create or update occurrence_history
   with occurrence_history_id_visit as (
     select tmp_occurrence_history.*, object_id, visits from tmp_occurrence_history
     left join occurrence_history using(origin, branch, target, target_type)
   ),
   occurrences_to_update as (
     select object_id, visit from occurrence_history_id_visit where object_id is not null
   ),
   update_occurrences as (
     update occurrence_history
     set visits = array(select unnest(occurrence_history.visits) as e
                         union
                        select occurrences_to_update.visit as e
                        order by e)
     from occurrences_to_update
     where occurrence_history.object_id = occurrences_to_update.object_id
   )
   insert into occurrence_history (origin, branch, target, target_type, visits)
     select origin, branch, target, target_type, ARRAY[visit]
       from occurrence_history_id_visit
       where object_id is null;
 
   -- update occurrence
   for origin_id in
     select distinct origin from tmp_occurrence_history
   loop
     perform swh_occurrence_update_for_origin(origin_id);
   end loop;
   return;
 end
 $$;
 
 
 -- Absolute path: directory reference + complete path relative to it
 create type content_dir as (
     directory  sha1_git,
     path       unix_path
 );
 
 
 -- Find the containing directory of a given content, specified by sha1
 -- (note: *not* sha1_git).
 --
 -- Return a pair (dir_it, path) where path is a UNIX path that, from the
 -- directory root, reach down to a file with the desired content. Return NULL
 -- if no match is found.
 --
 -- In case of multiple paths (i.e., pretty much always), an arbitrary one is
 -- chosen.
 create or replace function swh_content_find_directory(content_id sha1)
     returns content_dir
     language sql
     stable
 as $$
     with recursive path as (
 	-- Recursively build a path from the requested content to a root
 	-- directory. Each iteration returns a pair (dir_id, filename) where
 	-- filename is relative to dir_id. Stops when no parent directory can
 	-- be found.
 	(select dir.id as dir_id, dir_entry_f.name as name, 0 as depth
 	 from directory_entry_file as dir_entry_f
 	 join content on content.sha1_git = dir_entry_f.target
 	 join directory as dir on dir.file_entries @> array[dir_entry_f.id]
 	 where content.sha1 = content_id
 	 limit 1)
 	union all
 	(select dir.id as dir_id,
 		(dir_entry_d.name || '/' || path.name)::unix_path as name,
 		path.depth + 1
 	 from path
 	 join directory_entry_dir as dir_entry_d on dir_entry_d.target = path.dir_id
 	 join directory as dir on dir.dir_entries @> array[dir_entry_d.id]
 	 limit 1)
     )
     select dir_id, name from path order by depth desc limit 1;
 $$;
 
 
 -- Walk the revision history starting from a given revision, until a matching
 -- occurrence is found. Return all occurrence information if one is found, NULL
 -- otherwise.
 create or replace function swh_revision_find_occurrence(revision_id sha1_git)
     returns occurrence
     language sql
     stable
 as $$
 	select origin, branch, target, target_type
   from swh_revision_list_children(ARRAY[revision_id] :: bytea[]) as rev_list
 	left join occurrence_history occ_hist
   on rev_list.id = occ_hist.target
 	where occ_hist.origin is not null and
         occ_hist.target_type = 'revision'
 	limit 1;
 $$;
 
 -- Find the visit of origin id closest to date visit_date
 create or replace function swh_visit_find_by_date(origin bigint, visit_date timestamptz default NOW())
     returns origin_visit
     language sql
     stable
 as $$
   with closest_two_visits as ((
     select origin_visit, (date - visit_date) as interval
     from origin_visit
     where date >= visit_date
     order by date asc
     limit 1
   ) union (
     select origin_visit, (visit_date - date) as interval
     from origin_visit
     where date < visit_date
     order by date desc
     limit 1
   )) select (origin_visit).* from closest_two_visits order by interval limit 1
 $$;
 
 -- Find the visit of origin id closest to date visit_date
 create or replace function swh_visit_get(origin bigint)
     returns origin_visit
     language sql
     stable
 as $$
     select *
     from origin_visit
     where origin=origin
     order by date desc
 $$;
 
 
 -- Retrieve occurrence by filtering on origin_id and optionally on
 -- branch_name and/or validity range
 create or replace function swh_occurrence_get_by(
        origin_id bigint,
        branch_name bytea default NULL,
        date timestamptz default NULL)
     returns setof occurrence_history
     language plpgsql
 as $$
 declare
     filters text[] := array[] :: text[];  -- AND-clauses used to filter content
     visit_id bigint;
     q text;
 begin
     if origin_id is not null then
         filters := filters || format('origin = %L', origin_id);
     end if;
     if branch_name is not null then
         filters := filters || format('branch = %L', branch_name);
     end if;
     if date is not null then
         if origin_id is null then
             raise exception 'Needs an origin_id to filter by date.';
         end if;
         select visit from swh_visit_find_by_date(origin_id, date) into visit_id;
         if visit_id is null then
             return;
         end if;
         filters := filters || format('%L = any(visits)', visit_id);
     end if;
 
     if cardinality(filters) = 0 then
         raise exception 'At least one filter amongst (origin_id, branch_name, date) is needed';
     else
         q = format('select * ' ||
                    'from occurrence_history ' ||
                    'where %s',
 	        array_to_string(filters, ' and '));
         return query execute q;
     end if;
 end
 $$;
 
 
 -- Retrieve revisions by occurrence criterion filtering
 create or replace function swh_revision_get_by(
        origin_id bigint,
        branch_name bytea default NULL,
        date timestamptz default NULL)
     returns setof revision_entry
     language sql
     stable
 as $$
     select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
         r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
         r.type, r.directory, r.message,
         a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
         array(select rh.parent_id::bytea
             from revision_history rh
             where rh.id = r.id
             order by rh.parent_rank
         ) as parents, r.object_id
     from swh_occurrence_get_by(origin_id, branch_name, date) as occ
     inner join revision r on occ.target = r.id
     left join person a on a.id = r.author
     left join person c on c.id = r.committer;
 $$;
 
 -- Retrieve a release by occurrence criterion
 create or replace function swh_release_get_by(
        origin_id bigint)
     returns setof release_entry
     language sql
     stable
 as $$
    select r.id, r.target, r.target_type, r.date, r.date_offset, r.date_neg_utc_offset,
         r.name, r.comment, r.synthetic, a.id as author_id, a.fullname as author_fullname,
         a.name as author_name, a.email as author_email, r.object_id
     from release r
     inner join occurrence_history occ on occ.target = r.target
     left join person a on a.id = r.author
     where occ.origin = origin_id and occ.target_type = 'revision' and r.target_type = 'revision';
 $$;
 
 
 create type content_provenance as (
   content  sha1_git,
   revision sha1_git,
   origin   bigint,
   visit    bigint,
   path     unix_path
 );
 
 COMMENT ON TYPE content_provenance IS 'Provenance information on content';
 
 create or replace function swh_content_find_provenance(content_id sha1_git)
     returns setof content_provenance
     language sql
 as $$
     with subscripted_paths as (
         select content, revision_paths, generate_subscripts(revision_paths, 1) as s
         from cache_content_revision
         where content = content_id
     ),
     cleaned_up_contents as (
         select content, revision_paths[s][1]::sha1_git as revision, revision_paths[s][2]::unix_path as path
         from subscripted_paths
     )
     select cuc.content, cuc.revision, cro.origin, cro.visit, cuc.path
     from cleaned_up_contents cuc
     inner join cache_revision_origin cro using(revision)
 $$;
 
 COMMENT ON FUNCTION swh_content_find_provenance(sha1_git) IS 'Given a content, provide provenance information on it';
 
 
 create type object_found as (
     sha1_git   sha1_git,
     type       object_type,
     id         bytea,       -- sha1 or sha1_git depending on object_type
     object_id  bigint
 );
 
 -- Find objects by sha1_git, return their type and their main identifier
 create or replace function swh_object_find_by_sha1_git()
     returns setof object_found
     language plpgsql
 as $$
 begin
     return query
     with known_objects as ((
         select id as sha1_git, 'release'::object_type as type, id, object_id from release r
         where exists (select 1 from tmp_bytea t where t.id = r.id)
     ) union all (
         select id as sha1_git, 'revision'::object_type as type, id, object_id from revision r
         where exists (select 1 from tmp_bytea t where t.id = r.id)
     ) union all (
         select id as sha1_git, 'directory'::object_type as type, id, object_id from directory d
         where exists (select 1 from tmp_bytea t where t.id = d.id)
     ) union all (
         select sha1_git as sha1_git, 'content'::object_type as type, sha1 as id, object_id from content c
         where exists (select 1 from tmp_bytea t where t.id = c.sha1_git)
     ))
     select t.id::sha1_git as sha1_git, k.type, k.id, k.object_id from tmp_bytea t
       left join known_objects k on t.id = k.sha1_git;
 end
 $$;
 
 -- Create entries in entity_history from tmp_entity_history
 --
 -- TODO: do something smarter to compress the entries if the data
 -- didn't change.
 create or replace function swh_entity_history_add()
     returns void
     language plpgsql
 as $$
 begin
     insert into entity_history (
         uuid, parent, name, type, description, homepage, active, generated, lister_metadata, metadata, validity
     ) select * from tmp_entity_history;
     return;
 end
 $$;
 
 
 create or replace function swh_update_entity_from_entity_history()
     returns trigger
     language plpgsql
 as $$
 begin
     insert into entity (uuid, parent, name, type, description, homepage, active, generated,
       lister_metadata, metadata, last_seen, last_id)
       select uuid, parent, name, type, description, homepage, active, generated,
              lister_metadata, metadata, unnest(validity), id
       from entity_history
       where uuid = NEW.uuid
       order by unnest(validity) desc limit 1
     on conflict (uuid) do update set
       parent = EXCLUDED.parent,
       name = EXCLUDED.name,
       type = EXCLUDED.type,
       description = EXCLUDED.description,
       homepage = EXCLUDED.homepage,
       active = EXCLUDED.active,
       generated = EXCLUDED.generated,
       lister_metadata = EXCLUDED.lister_metadata,
       metadata = EXCLUDED.metadata,
       last_seen = EXCLUDED.last_seen,
       last_id = EXCLUDED.last_id;
 
     return null;
 end
 $$;
 
 create trigger update_entity
   after insert or update
   on entity_history
   for each row
   execute procedure swh_update_entity_from_entity_history();
 
 -- map an id of tmp_entity_lister to a full entity
 create type entity_id as (
     id               bigint,
     uuid             uuid,
     parent           uuid,
     name             text,
     type             entity_type,
     description      text,
     homepage         text,
     active           boolean,
     generated        boolean,
     lister_metadata  jsonb,
     metadata         jsonb,
     last_seen        timestamptz,
     last_id          bigint
 );
 
 -- find out the uuid of the entries of entity with the metadata
 -- contained in tmp_entity_lister
 create or replace function swh_entity_from_tmp_entity_lister()
     returns setof entity_id
     language plpgsql
 as $$
 begin
   return query
     select t.id, e.*
     from tmp_entity_lister t
     left join entity e
     on e.lister_metadata @> t.lister_metadata;
   return;
 end
 $$;
 
 create or replace function swh_entity_get(entity_uuid uuid)
     returns setof entity
     language sql
     stable
 as $$
   with recursive entity_hierarchy as (
   select e.*
     from entity e where uuid = entity_uuid
     union
     select p.*
     from entity_hierarchy e
     join entity p on e.parent = p.uuid
   )
   select *
   from entity_hierarchy;
 $$;
 
 
 -- Object listing by object_id
 
 create or replace function swh_content_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof content
     language sql
     stable
 as $$
     select * from content
     where object_id > min_excl and object_id <= max_incl
     order by object_id;
 $$;
 
 create or replace function swh_revision_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof revision_entry
     language sql
     stable
 as $$
     with revs as (
         select * from revision
         where object_id > min_excl and object_id <= max_incl
     )
     select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
            r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
            r.type, r.directory, r.message,
            a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
            array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
                as parents, r.object_id
     from revs r
     left join person a on a.id = r.author
     left join person c on c.id = r.committer
     order by r.object_id;
 $$;
 
 create or replace function swh_release_list_by_object_id(
     min_excl bigint,
     max_incl bigint
 )
     returns setof release_entry
     language sql
     stable
 as $$
     with rels as (
         select * from release
         where object_id > min_excl and object_id <= max_incl
     )
     select r.id, r.target, r.target_type, r.date, r.date_offset, r.date_neg_utc_offset, r.name, r.comment,
            r.synthetic, p.id as author_id, p.fullname as author_fullname, p.name as author_name, p.email as author_email, r.object_id
     from rels r
     left join person p on p.id = r.author
     order by r.object_id;
 $$;
 
 
 create or replace function swh_cache_content_revision_add()
     returns void
     language plpgsql
 as $$
 declare
   cnt bigint;
   d sha1_git;
 begin
   delete from tmp_bytea t where exists (select 1 from cache_content_revision_processed ccrp where t.id = ccrp.revision);
 
   select count(*) from tmp_bytea into cnt;
   if cnt <> 0 then
     create temporary table tmp_ccr (
         content sha1_git,
         directory sha1_git,
         path unix_path
     ) on commit drop;
 
     create temporary table tmp_ccrd (
         directory sha1_git,
         revision sha1_git
     ) on commit drop;
 
     insert into tmp_ccrd
       select directory, id as revision
       from tmp_bytea
       inner join revision using(id);
 
     insert into cache_content_revision_processed
       select distinct id from tmp_bytea order by id;
 
     for d in
       select distinct directory from tmp_ccrd
     loop
       insert into tmp_ccr
         select sha1_git as content, d as directory, name as path
         from swh_directory_walk(d)
         where type='file';
     end loop;
 
     with revision_contents as (
       select content, false as blacklisted, array_agg(ARRAY[revision::bytea, path::bytea]) as revision_paths
       from tmp_ccr
       inner join tmp_ccrd using (directory)
       group by content
       order by content
     ), updated_cache_entries as (
       update cache_content_revision ccr
       set revision_paths = ccr.revision_paths || rc.revision_paths
       from revision_contents rc
       where ccr.content = rc.content and ccr.blacklisted = false
       returning ccr.content
     ) insert into cache_content_revision
         select * from revision_contents rc
         where not exists (select 1 from updated_cache_entries uce where uce.content = rc.content)
         order by rc.content
       on conflict (content) do update
         set revision_paths = cache_content_revision.revision_paths || EXCLUDED.revision_paths
         where cache_content_revision.blacklisted = false;
     return;
   else
     return;
   end if;
 end
 $$;
 
 COMMENT ON FUNCTION swh_cache_content_revision_add() IS 'Cache the revisions from tmp_bytea into cache_content_revision';
 
 
 create or replace function swh_occurrence_by_origin_visit(origin_id bigint, visit_id bigint)
     returns setof occurrence
     language sql
     stable
 as $$
   select origin, branch, target, target_type from occurrence_history
   where origin = origin_id and visit_id = ANY(visits);
 $$;
 
 create type cache_content_signature as (
   sha1      sha1,
   sha1_git  sha1_git,
   sha256    sha256,
   revision_paths  bytea[][]
 );
 
 create or replace function swh_cache_content_get_all()
        returns setof cache_content_signature
        language sql
        stable
 as $$
     SELECT c.sha1, c.sha1_git, c.sha256, ccr.revision_paths
     FROM cache_content_revision ccr
     INNER JOIN content as c
     ON ccr.content = c.sha1_git
 $$;
 
 COMMENT ON FUNCTION swh_cache_content_get_all() IS 'Retrieve batch of contents';
 
 
 create or replace function swh_cache_content_get(target sha1_git)
        returns setof cache_content_signature
        language sql
        stable
 as $$
     SELECT c.sha1, c.sha1_git, c.sha256, ccr.revision_paths
     FROM cache_content_revision ccr
     INNER JOIN content as c
     ON ccr.content = c.sha1_git
     where ccr.content = target
 $$;
 
 COMMENT ON FUNCTION swh_cache_content_get(sha1_git) IS 'Retrieve cache content information';
 
 create or replace function swh_revision_from_target(target sha1_git, target_type object_type)
     returns sha1_git
     language plpgsql
 as $$
 #variable_conflict use_variable
 begin
    while target_type = 'release' loop
        select r.target, r.target_type from release r where r.id = target into target, target_type;
    end loop;
    if target_type = 'revision' then
        return target;
    else
        return null;
    end if;
 end
 $$;
 
 create or replace function swh_cache_revision_origin_add(origin_id bigint, visit_id bigint)
     returns setof sha1_git
     language plpgsql
 as $$
 declare
     visit_exists bool;
 begin
   select true from origin_visit where origin = origin_id and visit = visit_id into visit_exists;
 
   if not visit_exists then
       return;
   end if;
 
   visit_exists := null;
 
   select true from cache_revision_origin where origin = origin_id and visit = visit_id limit 1 into visit_exists;
 
   if visit_exists then
       return;
   end if;
 
   return query with new_pointed_revs as (
     select swh_revision_from_target(target, target_type) as id
     from swh_occurrence_by_origin_visit(origin_id, visit_id)
   ),
   old_pointed_revs as (
     select swh_revision_from_target(target, target_type) as id
     from swh_occurrence_by_origin_visit(origin_id,
       (select visit from origin_visit where origin = origin_id and visit < visit_id order by visit desc limit 1))
   ),
   new_revs as (
     select distinct id
     from swh_revision_list(array(select id::bytea from new_pointed_revs where id is not null))
   ),
   old_revs as (
     select distinct id
     from swh_revision_list(array(select id::bytea from old_pointed_revs where id is not null))
   )
   insert into cache_revision_origin (revision, origin, visit)
   select n.id as revision, origin_id, visit_id from new_revs n
     where not exists (
     select 1 from old_revs o
     where o.id = n.id)
    returning revision;
 end
 $$;
 
 -- check which entries of tmp_bytea are missing from content_mimetype
 --
 -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
 -- 2. call this function
 create or replace function swh_content_mimetype_missing()
     returns setof sha1
     language plpgsql
 as $$
 begin
     return query
 	(select id::sha1 from tmp_bytea as tmp
 	 where not exists
 	     (select 1 from content_mimetype as c where c.id = tmp.id));
     return;
 end
 $$;
 
 COMMENT ON FUNCTION swh_content_mimetype_missing() IS 'Filter missing content mimetype';
 
 
 -- add tmp_content_mimetype entries to content_mimetype, overwriting
 -- duplicates if conflict_update is true, skipping duplicates otherwise.
 --
 -- If filtering duplicates is in order, the call to
 -- swh_content_mimetype_missing must take place before calling this
 -- function.
 --
 --
 -- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype,
 -- 2. call this function
 create or replace function swh_content_mimetype_add(conflict_update boolean)
     returns void
     language plpgsql
 as $$
 begin
     if conflict_update then
         insert into content_mimetype (id, mimetype, encoding)
         select id, mimetype, encoding
         from tmp_content_mimetype
             on conflict(id)
                 do update set mimetype = excluded.mimetype,
                     encoding = excluded.encoding;
 
     else
         insert into content_mimetype (id, mimetype, encoding)
         select id, mimetype, encoding
          from tmp_content_mimetype
             on conflict do nothing;
     end if;
     return;
 end
 $$;
 
 comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';
 
 
 -- Retrieve list of content mimetype from the temporary table.
 --
 -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
 -- 2. call this function
 create or replace function swh_content_mimetype_get()
     returns setof content_mimetype
     language plpgsql
 as $$
 begin
     return query
         select id::sha1, mimetype, encoding
         from tmp_bytea t
         inner join content_mimetype using(id);
     return;
 end
 $$;
 
 comment on function swh_content_mimetype_get() IS 'List content mimetypes';
 
 
 -- check which entries of tmp_bytea are missing from content_language
 --
 -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
 -- 2. call this function
 create or replace function swh_content_language_missing()
     returns setof sha1
     language plpgsql
 as $$
 begin
     return query
 	(select id::sha1 from tmp_bytea as tmp
 	 where not exists
 	     (select 1 from content_language as c where c.id = tmp.id));
     return;
 end
 $$;
 
 comment on function swh_content_language_missing() IS 'Filter missing content languages';
 
 -- add tmp_content_language entries to content_language, overwriting
 -- duplicates if conflict_update is true, skipping duplicates otherwise.
 --
 -- If filtering duplicates is in order, the call to
 -- swh_content_language_missing must take place before calling this
 -- function.
 --
 -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to
 -- tmp_content_language, 2. call this function
 create or replace function swh_content_language_add(conflict_update boolean)
     returns void
     language plpgsql
 as $$
 begin
     if conflict_update then
         insert into content_language (id, lang)
         select id, lang
     	from tmp_content_language
             on conflict(id)
                 do update set lang = excluded.lang;
 
     else
         insert into content_language (id, lang)
         select id, lang
     	from tmp_content_language
             on conflict do nothing;
     end if;
     return;
 end
 $$;
 
 comment on function swh_content_language_add(boolean) IS 'Add new content languages';
 
 -- Retrieve list of content language from the temporary table.
 --
 -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
 create or replace function swh_content_language_get()
     returns setof content_language
     language plpgsql
 as $$
 begin
     return query
         select id::sha1, lang
         from tmp_bytea t
         inner join content_language using(id);
     return;
 end
 $$;
 
 comment on function swh_content_language_get() IS 'List content languages';
 
 
 -- add tmp_content_ctags entries to content_ctags, overwriting
 -- duplicates if conflict_update is true, skipping duplicates otherwise.
 --
 -- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags,
 -- 2. call this function
 create or replace function swh_content_ctags_add(conflict_update boolean)
     returns void
     language plpgsql
 as $$
 begin
     if conflict_update then
         delete from content_ctags
         where id in (select distinct id from tmp_content_ctags);
     end if;
 
     insert into content_ctags (id, name, kind, line, lang)
     select id, name, kind, line, lang
     from tmp_content_ctags
         on conflict(id, md5(name), kind, line, lang)
         do nothing;
     return;
 end
 $$;
 
 comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';
 
 -- check which entries of tmp_bytea are missing from content_ctags
 --
 -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
 -- 2. call this function
 create or replace function swh_content_ctags_missing()
     returns setof sha1
     language plpgsql
 as $$
 begin
     return query
 	(select id::sha1 from tmp_bytea as tmp
 	 where not exists
 	     (select 1 from content_ctags as c where c.id = tmp.id limit 1));
     return;
 end
 $$;
 
 comment on function swh_content_ctags_missing() IS 'Filter missing content ctags';
 
 create type content_ctags_signature as (
   id sha1,
   name text,
   kind text,
   line bigint,
   lang ctags_languages
 );
 
 -- Retrieve list of content ctags from the temporary table.
 --
 -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
 create or replace function swh_content_ctags_get()
     returns setof content_ctags_signature
     language plpgsql
 as $$
 begin
     return query
         select c.id, c.name, c.kind, c.line, c.lang
         from tmp_bytea t
         inner join content_ctags c using(id)
         order by line;
     return;
 end
 $$;
 
 comment on function swh_content_ctags_get() IS 'List content ctags';
 
 -- Search within ctags content.
 --
 create or replace function swh_content_ctags_search(expression text, l integer, o integer)
     returns setof content_ctags_signature
-    language sql
+    language plpgsql
 as $$
-    select id, name, kind, line, lang
-    from content_ctags
-    where searchable_symbol @@ to_tsquery(expression)
-    order by id
-    limit l
-    offset o;
+begin
+    return query
+        select id, name, kind, line, lang
+        from content_ctags
+        where searchable_symbol @@ to_tsquery(expression)
+        order by id
+        limit l
+        offset o;
+exception
+    when sqlstate '42000' then  -- syntax error
+        raise exception 'Bad syntax in expression ''%''', expression;
+end
 $$;
 
 comment on function swh_content_ctags_search(text, integer, integer) IS 'Search through ctags'' symbols';
 
 
 -- check which entries of tmp_bytea are missing from content_fossology_license
 --
 -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
 -- 2. call this function
 create or replace function swh_content_fossology_license_missing()
     returns setof sha1
     language plpgsql
 as $$
 begin
     return query
 	(select id::sha1 from tmp_bytea as tmp
 	 where not exists
 	     (select 1 from content_fossology_license as c where c.id = tmp.id));
     return;
 end
 $$;
 
 comment on function swh_content_fossology_license_missing() IS 'Filter missing content licenses';
 
 -- add tmp_content_fossology_license entries to content_fossology_license, overwriting
 -- duplicates if conflict_update is true, skipping duplicates otherwise.
 --
 -- If filtering duplicates is in order, the call to
 -- swh_content_fossology_license_missing must take place before calling this
 -- function.
 --
 -- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to
 -- tmp_content_fossology_license, 2. call this function
 create or replace function swh_content_fossology_license_add(conflict_update boolean)
     returns void
     language plpgsql
 as $$
 begin
     if conflict_update then
         delete from content_fossology_license
         where id in (select distinct id from tmp_content_fossology_license);
     end if;
 
     insert into content_fossology_license (id, license_id, indexer_configuration_id)
     select tcl.id,
           (select id from fossology_license where name = tcl.license) as license,
           (select id from indexer_configuration where tool_name = tcl.tool_name
                                                 and tool_version = tcl.tool_version)
                           as indexer_configuration_id
     from tmp_content_fossology_license tcl
         on conflict(id, license_id, indexer_configuration_id)
         do nothing;
     return;
 end
 $$;
 
 comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
 
 create or replace function swh_content_fossology_license_unknown()
     returns setof text
     language plpgsql
 as $$
 begin
     return query
         select name from tmp_content_fossology_license_unknown t where not exists (
             select 1 from fossology_license where name=t.name
         );
 end
 $$;
 
 comment on function swh_content_fossology_license_unknown() IS 'List unknown licenses';
 
 create type content_fossology_license_signature as (
   id           sha1,
   tool_name    text,
   tool_version text,
   licenses     text[]
 );
 
 -- Retrieve list of content license from the temporary table.
 --
 -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
 -- 2. call this function
 create or replace function swh_content_fossology_license_get()
     returns setof content_fossology_license_signature
     language plpgsql
 as $$
 begin
     return query
       select cl.id,
              ic.tool_name,
              ic.tool_version,
              array(select name
                    from fossology_license
                    where id = ANY(array_agg(cl.license_id))) as licenses
       from tmp_bytea tcl
       inner join content_fossology_license cl using(id)
       inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id
       group by cl.id, ic.tool_name, ic.tool_version;
     return;
 end
 $$;
 
 comment on function swh_content_fossology_license_get() IS 'List content licenses';
 
 
 -- simple counter mapping a textual label to an integer value
 create type counter as (
     label  text,
     value  bigint
 );
 
 -- return statistics about the number of tuples in various SWH tables
 --
 -- Note: the returned values are based on postgres internal statistics
 -- (pg_class table), which are only updated daily (by autovacuum) or so
 create or replace function swh_stat_counters()
     returns setof counter
     language sql
     stable
 as $$
     select relname::text as label, reltuples::bigint as value
     from pg_class
     where oid in (
         'public.content'::regclass,
         'public.directory'::regclass,
         'public.directory_entry_dir'::regclass,
         'public.directory_entry_file'::regclass,
         'public.directory_entry_rev'::regclass,
         'public.occurrence'::regclass,
         'public.occurrence_history'::regclass,
         'public.origin'::regclass,
         'public.person'::regclass,
         'public.entity'::regclass,
         'public.entity_history'::regclass,
         'public.release'::regclass,
         'public.revision'::regclass,
         'public.revision_history'::regclass,
         'public.skipped_content'::regclass
     );
 $$;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
index 0eef2708d..a0a6dd01c 100644
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -1,651 +1,651 @@
 ---
 --- Software Heritage Data Model
 ---
 
 -- drop schema if exists swh cascade;
 -- create schema swh;
 -- set search_path to swh;
 
 create table dbversion
 (
   version     int primary key,
   release     timestamptz,
   description text
 );
 
 insert into dbversion(version, release, description)
-      values(94, now(), 'Work In Progress');
+      values(95, now(), 'Work In Progress');
 
 -- a SHA1 checksum (not necessarily originating from Git)
 create domain sha1 as bytea check (length(value) = 20);
 
 -- a Git object ID, i.e., a SHA1 checksum
 create domain sha1_git as bytea check (length(value) = 20);
 
 -- a SHA256 checksum
 create domain sha256 as bytea check (length(value) = 32);
 
 -- UNIX path (absolute, relative, individual path component, etc.)
 create domain unix_path as bytea;
 
 -- a set of UNIX-like access permissions, as manipulated by, e.g., chmod
 create domain file_perms as int;
 
 -- Checksums about actual file content. Note that the content itself is not
 -- stored in the DB, but on external (key-value) storage. A single checksum is
 -- used as key there, but the other can be used to verify that we do not inject
 -- content collisions not knowingly.
 create table content
 (
   sha1      sha1 primary key,
   sha1_git  sha1_git not null,
   sha256    sha256 not null,
   length    bigint not null,
   ctime     timestamptz not null default now(),
             -- creation time, i.e. time of (first) injection into the storage
   status    content_status not null default 'visible',
   object_id bigserial
 );
 
 create unique index on content(sha1_git);
 create unique index on content(sha256);
 create index on content(ctime);  -- TODO use a BRIN index here (postgres >= 9.5)
 create index on content(object_id);
 
 -- Asynchronous notification of new content insertions
 create function notify_new_content()
   returns trigger
   language plpgsql
 as $$
   begin
     perform pg_notify('new_content', encode(new.sha1, 'hex'));
     return null;
   end;
 $$;
 
 create trigger notify_new_content
   after insert on content
   for each row
   execute procedure notify_new_content();
 
 
 -- Entities constitute a typed hierarchy of organization, hosting
 -- facilities, groups, people and software projects.
 --
 -- Examples of entities: Software Heritage, Debian, GNU, GitHub,
 -- Apache, The Linux Foundation, the Debian Python Modules Team, the
 -- torvalds GitHub user, the torvalds/linux GitHub project.
 --
 -- The data model is hierarchical (via the parent attribute) and might
 -- store sub-branches of existing entities. The key feature of an
 -- entity is might be *listed* (if it is available in listable_entity)
 -- to retrieve information about its content, i.e: sub-entities,
 -- projects, origins.
 
 -- The history of entities. Allows us to keep historical metadata
 -- about entities.  The temporal invariant is the uuid. Root
 -- organization uuids are manually generated (and available in
 -- swh-data.sql).
 --
 -- For generated entities (generated = true), we can provide
 -- generation_metadata to allow listers to retrieve the uuids of previous
 -- iterations of the entity.
 --
 -- Inactive entities that have been active in the past (active =
 -- false) should register the timestamp at which we saw them
 -- deactivate, in a new entry of entity_history.
 create table entity_history
 (
   id               bigserial primary key,
   uuid             uuid,
   parent           uuid,             -- should reference entity_history(uuid)
   name             text not null,
   type             entity_type not null,
   description      text,
   homepage         text,
   active           boolean not null, -- whether the entity was seen on the last listing
   generated        boolean not null, -- whether this entity has been generated by a lister
   lister_metadata  jsonb,            -- lister-specific metadata, used for queries
   metadata         jsonb,
   validity         timestamptz[]     -- timestamps at which we have seen this entity
 );
 
 create index on entity_history(uuid);
 create index on entity_history(name);
 
 -- The entity table provides a view of the latest information on a
 -- given entity. It is updated via a trigger on entity_history.
 create table entity
 (
   uuid             uuid primary key,
   parent           uuid references entity(uuid) deferrable initially deferred,
   name             text not null,
   type             entity_type not null,
   description      text,
   homepage         text,
   active           boolean not null, -- whether the entity was seen on the last listing
   generated        boolean not null, -- whether this entity has been generated by a lister
   lister_metadata  jsonb,            -- lister-specific metadata, used for queries
   metadata         jsonb,
   last_seen        timestamptz,      -- last listing time or disappearance time for active=false
   last_id          bigint references entity_history(id) -- last listing id
 );
 
 create index on entity(name);
 create index on entity using gin(lister_metadata jsonb_path_ops);
 
 -- Register the equivalence between two entities. Allows sideways
 -- navigation in the entity table
 create table entity_equivalence
 (
   entity1 uuid references entity(uuid),
   entity2 uuid references entity(uuid),
   primary key (entity1, entity2),
   constraint order_entities check (entity1 < entity2)
 );
 
 -- Register a lister for a specific entity.
 create table listable_entity
 (
   uuid         uuid references entity(uuid) primary key,
   enabled      boolean not null default true, -- do we list this entity automatically?
   list_engine  text,  -- crawler to be used to list entity's content
   list_url     text,  -- root URL to start the listing
   list_params  jsonb,  -- org-specific listing parameter
   latest_list  timestamptz  -- last time the entity's content has been listed
 );
 
 -- Log of all entity listings (i.e., entity crawling) that have been
 -- done in the past, or are still ongoing.
 create table list_history
 (
   id        bigserial primary key,
   entity    uuid references listable_entity(uuid),
   date      timestamptz not null,
   status    boolean,  -- true if and only if the listing has been successful
   result    jsonb,     -- more detailed return value, depending on status
   stdout    text,
   stderr    text,
   duration  interval  -- fetch duration of NULL if still ongoing
 );
 
 
 -- An origin is a place, identified by an URL, where software can be found. We
 -- support different kinds of origins, e.g., git and other VCS repositories,
 -- web pages that list tarballs URLs (e.g., http://www.kernel.org), indirect
 -- tarball URLs (e.g., http://www.example.org/latest.tar.gz), etc. The key
 -- feature of an origin is that it can be *fetched* (wget, git clone, svn
 -- checkout, etc.) to retrieve all the contained software.
 create table origin
 (
   id       bigserial primary key,
   type     text, -- TODO use an enum here (?)
   url      text not null,
   lister   uuid references listable_entity(uuid),
   project  uuid references entity(uuid)
 );
 
 create index on origin(type, url);
 
 -- Asynchronous notification of new origin insertions
 create function notify_new_origin()
   returns trigger
   language plpgsql
 as $$
   begin
     perform pg_notify('new_origin', new.id::text);
     return null;
   end;
 $$;
 
 create trigger notify_new_origin
   after insert on origin
   for each row
   execute procedure notify_new_origin();
 
 -- Content we have seen but skipped for some reason. This table is
 -- separate from the content table as we might not have the sha1
 -- checksum of that data (for instance when we inject git
 -- repositories, objects that are too big will be skipped here, and we
 -- will only know their sha1_git). 'reason' contains the reason the
 -- content was skipped. origin is a nullable column allowing to find
 -- out which origin contains that skipped content.
 create table skipped_content
 (
   sha1      sha1,
   sha1_git  sha1_git,
   sha256    sha256,
   length    bigint not null,
   ctime     timestamptz not null default now(),
   status    content_status not null default 'absent',
   reason    text not null,
   origin    bigint references origin(id),
   object_id bigserial,
   unique (sha1, sha1_git, sha256)
 );
 
 -- Those indexes support multiple NULL values.
 create unique index on skipped_content(sha1);
 create unique index on skipped_content(sha1_git);
 create unique index on skipped_content(sha256);
 create index on skipped_content(object_id);
 
 -- Asynchronous notification of new skipped content insertions
 create function notify_new_skipped_content()
   returns trigger
   language plpgsql
 as $$
   begin
   perform pg_notify('new_skipped_content', json_build_object(
       'sha1', encode(new.sha1, 'hex'),
       'sha1_git', encode(new.sha1_git, 'hex'),
       'sha256', encode(new.sha256, 'hex')
     )::text);
     return null;
   end;
 $$;
 
 create trigger notify_new_skipped_content
   after insert on skipped_content
   for each row
   execute procedure notify_new_skipped_content();
 
 
 -- Log of all origin fetches (i.e., origin crawling) that have been done in the
 -- past, or are still ongoing. Similar to list_history, but for origins.
 create table fetch_history
 (
   id        bigserial primary key,
   origin    bigint references origin(id),
   date      timestamptz not null,
   status    boolean,  -- true if and only if the fetch has been successful
   result    jsonb,     -- more detailed returned values, times, etc...
   stdout    text,
   stderr    text,     -- null when status is true, filled otherwise
   duration  interval  -- fetch duration of NULL if still ongoing
 );
 
 
 -- A file-system directory.  A directory is a list of directory entries (see
 -- tables: directory_entry_{dir,file}).
 --
 -- To list the contents of a directory:
 -- 1. list the contained directory_entry_dir using array dir_entries
 -- 2. list the contained directory_entry_file using array file_entries
 -- 3. list the contained directory_entry_rev using array rev_entries
 -- 4. UNION
 --
 -- Synonyms/mappings:
 -- * git: tree
 create table directory
 (
   id            sha1_git primary key,
   dir_entries   bigint[],  -- sub-directories, reference directory_entry_dir
   file_entries  bigint[],  -- contained files, reference directory_entry_file
   rev_entries   bigint[],  -- mounted revisions, reference directory_entry_rev
   object_id     bigserial  -- short object identifier
 );
 
 create index on directory using gin (dir_entries);
 create index on directory using gin (file_entries);
 create index on directory using gin (rev_entries);
 create index on directory(object_id);
 
 -- Asynchronous notification of new directory insertions
 create function notify_new_directory()
   returns trigger
   language plpgsql
 as $$
   begin
     perform pg_notify('new_directory', encode(new.id, 'hex'));
     return null;
   end;
 $$;
 
 create trigger notify_new_directory
   after insert on directory
   for each row
   execute procedure notify_new_directory();
 
 
 -- A directory entry pointing to a sub-directory.
 create table directory_entry_dir
 (
   id      bigserial primary key,
   target  sha1_git,   -- id of target directory
   name    unix_path,  -- path name, relative to containing dir
   perms   file_perms  -- unix-like permissions
 );
 
 create unique index on directory_entry_dir(target, name, perms);
 
 -- A directory entry pointing to a file.
 create table directory_entry_file
 (
   id      bigserial primary key,
   target  sha1_git,   -- id of target file
   name    unix_path,  -- path name, relative to containing dir
   perms   file_perms  -- unix-like permissions
 );
 
 create unique index on directory_entry_file(target, name, perms);
 
 -- A directory entry pointing to a revision.
 create table directory_entry_rev
 (
   id      bigserial primary key,
   target  sha1_git,   -- id of target revision
   name    unix_path,  -- path name, relative to containing dir
   perms   file_perms  -- unix-like permissions
 );
 
 create unique index on directory_entry_rev(target, name, perms);
 
 create table person
 (
   id        bigserial primary key,
   fullname  bytea not null, -- freeform specification; what is actually used in the checksums
                             --     will usually be of the form 'name <email>'
   name      bytea,          -- advisory: not null if we managed to parse a name
   email     bytea           -- advisory: not null if we managed to parse an email
 );
 
 create unique index on person(fullname);
 create index on person(name);
 create index on person(email);
 
 -- A snapshot of a software project at a specific point in time.
 --
 -- Synonyms/mappings:
 -- * git / subversion / etc: commit
 -- * tarball: a specific tarball
 --
 -- Revisions are organized as DAGs. Each revision points to 0, 1, or more (in
 -- case of merges) parent revisions. Each revision points to a directory, i.e.,
 -- a file-system tree containing files and directories.
 create table revision
 (
   id                    sha1_git primary key,
   date                  timestamptz,
   date_offset           smallint,
   date_neg_utc_offset   boolean,
   committer_date        timestamptz,
   committer_date_offset smallint,
   committer_date_neg_utc_offset boolean,
   type                  revision_type not null,
   directory             sha1_git,  -- file-system tree
   message               bytea,
   author                bigint references person(id),
   committer             bigint references person(id),
   metadata              jsonb, -- extra metadata (tarball checksums, extra commit information, etc...)
   synthetic             boolean not null default false,  -- true if synthetic (cf. swh-loader-tar)
   object_id             bigserial
 );
 
 create index on revision(directory);
 create index on revision(object_id);
 
 -- Asynchronous notification of new revision insertions
 create function notify_new_revision()
   returns trigger
   language plpgsql
 as $$
   begin
     perform pg_notify('new_revision', encode(new.id, 'hex'));
     return null;
   end;
 $$;
 
 create trigger notify_new_revision
   after insert on revision
   for each row
   execute procedure notify_new_revision();
 
 
 -- either this table or the sha1_git[] column on the revision table
 create table revision_history
 (
   id           sha1_git references revision(id),
   parent_id    sha1_git,
   parent_rank  int not null default 0,
     -- parent position in merge commits, 0-based
   primary key (id, parent_rank)
 );
 
 create index on revision_history(parent_id);
 
 -- The timestamps at which Software Heritage has made a visit of the given origin.
 create table origin_visit
 (
   origin    bigint not null references origin(id),
   visit     bigint not null,
   date      timestamptz not null,
   status    origin_visit_status not null,
   metadata  jsonb,
   primary key (origin, visit)
 );
 
 comment on column origin_visit.origin is 'Visited origin';
 comment on column origin_visit.visit is 'Visit number the visit occurred for that origin';
 comment on column origin_visit.date is 'Visit date for that origin';
 comment on column origin_visit.status is 'Visit status for that origin';
 comment on column origin_visit.metadata is 'Metadata associated with the visit';
 
 create index on origin_visit(date);
 
 -- Asynchronous notification of new origin visits
 create function notify_new_origin_visit()
   returns trigger
   language plpgsql
 as $$
   begin
     perform pg_notify('new_origin_visit', json_build_object(
       'origin', new.origin,
       'visit', new.visit
     )::text);
     return null;
   end;
 $$;
 
 create trigger notify_new_origin_visit
   after insert on origin_visit
   for each row
   execute procedure notify_new_origin_visit();
 
 
 -- The content of software origins is indexed starting from top-level pointers
 -- called "branches". Every time we fetch some origin we store in this table
 -- where the branches pointed to at fetch time.
 --
 -- Synonyms/mappings:
 -- * git: ref (in the "git update-ref" sense)
 create table occurrence_history
 (
   origin       bigint references origin(id) not null,
   branch       bytea not null,        -- e.g., b"master" (for VCS), or b"sid" (for Debian)
   target       sha1_git not null,     -- ref target, e.g., commit id
   target_type  object_type not null,  -- ref target type
   object_id    bigserial not null,    -- short object identifier
   visits       bigint[] not null,     -- the visits where that occurrence was valid. References
                                       -- origin_visit(visit), where o_h.origin = origin_visit.origin.
   primary key (object_id)
 );
 
 create index on occurrence_history(target, target_type);
 create index on occurrence_history(origin, branch);
 create unique index on occurrence_history(origin, branch, target, target_type);
 create index on occurrence_history(object_id);
 
 -- Materialized view of occurrence_history, storing the *current* value of each
 -- branch, as last seen by SWH.
 create table occurrence
 (
   origin    bigint references origin(id) not null,
   branch    bytea not null,
   target    sha1_git not null,
   target_type object_type not null,
   primary key(origin, branch)
 );
 
 -- A "memorable" point in the development history of a project.
 --
 -- Synonyms/mappings:
 -- * git: tag (of the annotated kind, otherwise they are just references)
 -- * tarball: the release version number
 create table release
 (
   id          sha1_git primary key,
   target      sha1_git,
   target_type object_type,
   date        timestamptz,
   date_offset smallint,
   date_neg_utc_offset  boolean,
   name        bytea,
   comment     bytea,
   author      bigint references person(id),
   synthetic   boolean not null default false,  -- true if synthetic (cf. swh-loader-tar)
   object_id   bigserial
 );
 
 create index on release(target, target_type);
 create index on release(object_id);
 
 -- Asynchronous notification of new release insertions
 create function notify_new_release()
   returns trigger
   language plpgsql
 as $$
   begin
     perform pg_notify('new_release', encode(new.id, 'hex'));
     return null;
   end;
 $$;
 
 create trigger notify_new_release
   after insert on release
   for each row
   execute procedure notify_new_release();
 
 
 -- Content provenance information caches
 -- https://forge.softwareheritage.org/T547
 --
 -- Those tables aren't expected to be exhaustive, and get filled on a case by
 -- case basis: absence of data doesn't mean the data is not there
 
 -- content <-> revision mapping cache
 --
 -- semantics: "we have seen the content with given id in the given path inside
 -- the given revision"
 
 create table cache_content_revision (
     content         sha1_git not null primary key references content(sha1_git),
     blacklisted     boolean default false,
     revision_paths  bytea[][]
 );
 
 create table cache_content_revision_processed (
     revision  sha1_git not null primary key references revision(id)
 );
 
 -- revision <-> origin_visit mapping cache
 --
 -- semantics: "we have seen the given revision in the given origin during the
 -- given visit"
 
 create table cache_revision_origin (
    revision  sha1_git not null references revision(id),
    origin    bigint not null,
    visit     bigint not null,
    primary key (revision, origin, visit),
    foreign key (origin, visit) references origin_visit (origin, visit)
 );
 
 create index on cache_revision_origin(revision);
 
 -- Computing metadata on sha1's contents
 
 -- Properties (mimetype, encoding, etc...)
 create table content_mimetype (
   id sha1 primary key references content(sha1) not null,
   mimetype bytea not null,
   encoding bytea not null
 );
 
 comment on table content_mimetype is 'Metadata associated to a raw content';
 comment on column content_mimetype.mimetype is 'Raw content Mimetype';
 comment on column content_mimetype.encoding is 'Raw content encoding';
 
 -- Language metadata
 create table content_language (
   id sha1 primary key references content(sha1) not null,
   lang languages not null
 );
 
 comment on table content_language is 'Language information on a raw content';
 comment on column content_language.lang is 'Language information';
 
 -- ctags information per content
 create table content_ctags (
   id sha1 references content(sha1) not null,
   name text not null,
   kind text not null,
   line bigint not null,
   lang ctags_languages not null,
   searchable_symbol tsvector
 );
 
 comment on table content_ctags is 'Ctags information on a raw content';
 comment on column content_ctags.id is 'Content identifier';
 comment on column content_ctags.name is 'Symbol name';
 comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)';
 comment on column content_ctags.line is 'Symbol line';
 comment on column content_ctags.lang is 'Language information for that content';
 comment on column content_ctags.searchable_symbol is 'Searchable symbol derived from name column';
 
 create index on content_ctags(id);
 create unique index on content_ctags(id, md5(name), kind, line, lang);
 
 create trigger content_ctags_tsvectorupdate before insert or update
 on content_ctags for each row execute procedure
 tsvector_update_trigger(searchable_symbol, 'pg_catalog.english', name);
 
 create index searchable_symbol_idx ON content_ctags USING GIN (searchable_symbol);
 
 create table fossology_license(
   id smallserial primary key,
   name text not null
 );
 
 comment on table fossology_license is 'Possible license recognized by license indexer';
 comment on column fossology_license.id is 'License identifier';
 comment on column fossology_license.name is 'License name';
 
 create unique index on fossology_license(name);
 
 create table indexer_configuration (
   id serial primary key not null,
   tool_name text not null,
   tool_version text not null,
   tool_configuration jsonb
 );
 
 comment on table indexer_configuration is 'Indexer''s configuration version';
 comment on column indexer_configuration.id is 'Tool identifier';
 comment on column indexer_configuration.tool_version is 'Tool name';
 comment on column indexer_configuration.tool_version is 'Tool version';
 comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...';
 
 create unique index on indexer_configuration(tool_name, tool_version);
 
 create table content_fossology_license (
   id sha1 references content(sha1) not null,
   license_id smallserial references fossology_license(id) not null,
   indexer_configuration_id bigserial references indexer_configuration(id) not null
 );
 
 create unique index on content_fossology_license(id, license_id, indexer_configuration_id);
 
 comment on table content_fossology_license is 'license associated to a raw content';
 comment on column content_fossology_license.id is 'Raw content identifier';
 comment on column content_fossology_license.license_id is 'One of the content''s license identifier';
diff --git a/sql/upgrades/095.sql b/sql/upgrades/095.sql
new file mode 100644
index 000000000..676e8736a
--- /dev/null
+++ b/sql/upgrades/095.sql
@@ -0,0 +1,25 @@
+-- SWH DB schema upgrade
+-- from_version: 94
+-- to_version: 95
+-- description: Fix edge case on content_ctags search function
+
+insert into dbversion(version, release, description)
+      values(95, now(), 'Work In Progress');
+
+create or replace function swh_content_ctags_search(expression text, l integer, o integer)
+    returns setof content_ctags_signature
+    language plpgsql
+as $$
+begin
+    return query
+        select id, name, kind, line, lang
+        from content_ctags
+        where searchable_symbol @@ to_tsquery(expression)
+        order by id
+        limit l
+        offset o;
+exception
+    when sqlstate '42000' then  -- syntax error
+        raise exception 'Bad syntax in expression ''%''', expression;
+end
+$$;
diff --git a/swh/storage/db.py b/swh/storage/db.py
index 0d5e06c4e..21c7d55dd 100644
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -1,919 +1,927 @@
 # Copyright (C) 2015-2016  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import binascii
 import datetime
 import functools
 import json
 import psycopg2
 import psycopg2.extras
 import select
 import tempfile
 
 from contextlib import contextmanager
 
+from .exc import BadSyntaxAPIError
+
+
 TMP_CONTENT_TABLE = 'tmp_content'
 
 
 psycopg2.extras.register_uuid()
 
 
 def stored_procedure(stored_proc):
     """decorator to execute remote stored procedure, specified as argument
 
     Generally, the body of the decorated function should be empty. If it is
     not, the stored procedure will be executed first; the function body then.
 
     """
     def wrap(meth):
         @functools.wraps(meth)
         def _meth(self, *args, **kwargs):
             cur = kwargs.get('cur', None)
             self._cursor(cur).execute('SELECT %s()' % stored_proc)
             meth(self, *args, **kwargs)
         return _meth
     return wrap
 
 
 def jsonize(value):
     """Convert a value to a psycopg2 JSON object if necessary"""
     if isinstance(value, dict):
         return psycopg2.extras.Json(value)
 
     return value
 
 
 def entry_to_bytes(entry):
     """Convert an entry coming from the database to bytes"""
     if isinstance(entry, memoryview):
         return entry.tobytes()
     if isinstance(entry, list):
         return [entry_to_bytes(value) for value in entry]
     return entry
 
 
 def line_to_bytes(line):
     """Convert a line coming from the database to bytes"""
     if not line:
         return line
     if isinstance(line, dict):
         return {k: entry_to_bytes(v) for k, v in line.items()}
     return line.__class__(entry_to_bytes(entry) for entry in line)
 
 
 def cursor_to_bytes(cursor):
     """Yield all the data from a cursor as bytes"""
     yield from (line_to_bytes(line) for line in cursor)
 
 
 class BaseDb:
     """Base class for swh.storage.*Db.
 
     cf. swh.storage.db.Db, swh.storage.archiver.db.ArchiverDb
 
     """
 
     @classmethod
     def connect(cls, *args, **kwargs):
         """factory method to create a DB proxy
 
         Accepts all arguments of psycopg2.connect; only some specific
         possibilities are reported below.
 
         Args:
             connstring: libpq2 connection string
 
         """
         conn = psycopg2.connect(*args, **kwargs)
         return cls(conn)
 
     def _cursor(self, cur_arg):
         """get a cursor: from cur_arg if given, or a fresh one otherwise
 
         meant to avoid boilerplate if/then/else in methods that proxy stored
         procedures
 
         """
         if cur_arg is not None:
             return cur_arg
         # elif self.cur is not None:
         #     return self.cur
         else:
             return self.conn.cursor()
 
     def __init__(self, conn):
         """create a DB proxy
 
         Args:
             conn: psycopg2 connection to the SWH DB
 
         """
         self.conn = conn
 
     @contextmanager
     def transaction(self):
         """context manager to execute within a DB transaction
 
         Yields:
             a psycopg2 cursor
 
         """
         with self.conn.cursor() as cur:
             try:
                 yield cur
                 self.conn.commit()
             except:
                 if not self.conn.closed:
                     self.conn.rollback()
                 raise
 
     def copy_to(self, items, tblname, columns, cur=None, item_cb=None):
         """Copy items' entries to table tblname with columns information.
 
         Args:
             items (dict): dictionary of data to copy over tblname
             tblname (str): Destination table's name
             columns ([str]): keys to access data in items and also the
               column names in the destination table.
             item_cb (fn): optional function to apply to items's entry
 
         """
         def escape(data):
             if data is None:
                 return ''
             if isinstance(data, bytes):
                 return '\\x%s' % binascii.hexlify(data).decode('ascii')
             elif isinstance(data, str):
                 return '"%s"' % data.replace('"', '""')
             elif isinstance(data, datetime.datetime):
                 # We escape twice to make sure the string generated by
                 # isoformat gets escaped
                 return escape(data.isoformat())
             elif isinstance(data, dict):
                 return escape(json.dumps(data))
             elif isinstance(data, list):
                 return escape("{%s}" % ','.join(escape(d) for d in data))
             elif isinstance(data, psycopg2.extras.Range):
                 # We escape twice here too, so that we make sure
                 # everything gets passed to copy properly
                 return escape(
                     '%s%s,%s%s' % (
                         '[' if data.lower_inc else '(',
                         '-infinity' if data.lower_inf else escape(data.lower),
                         'infinity' if data.upper_inf else escape(data.upper),
                         ']' if data.upper_inc else ')',
                     )
                 )
             else:
                 # We don't escape here to make sure we pass literals properly
                 return str(data)
         with tempfile.TemporaryFile('w+') as f:
             for d in items:
                 if item_cb is not None:
                     item_cb(d)
                 line = [escape(d.get(k)) for k in columns]
                 f.write(','.join(line))
                 f.write('\n')
             f.seek(0)
             self._cursor(cur).copy_expert('COPY %s (%s) FROM STDIN CSV' % (
                 tblname, ', '.join(columns)), f)
 
 
 class Db(BaseDb):
     """Proxy to the SWH DB, with wrappers around stored procedures
 
     """
     def mktemp(self, tblname, cur=None):
         self._cursor(cur).execute('SELECT swh_mktemp(%s)', (tblname,))
 
     def mktemp_dir_entry(self, entry_type, cur=None):
         self._cursor(cur).execute('SELECT swh_mktemp_dir_entry(%s)',
                                   (('directory_entry_%s' % entry_type),))
 
     @stored_procedure('swh_mktemp_revision')
     def mktemp_revision(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_release')
     def mktemp_release(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_occurrence_history')
     def mktemp_occurrence_history(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_entity_lister')
     def mktemp_entity_lister(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_entity_history')
     def mktemp_entity_history(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_bytea')
     def mktemp_bytea(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_content_fossology_license')
     def mktemp_content_fossology_license(self, cur=None): pass
 
     @stored_procedure('swh_mktemp_content_fossology_license_unknown')
     def mktemp_content_fossology_license_unknown(self, cur=None): pass
 
     def register_listener(self, notify_queue, cur=None):
         """Register a listener for NOTIFY queue `notify_queue`"""
         self._cursor(cur).execute("LISTEN %s" % notify_queue)
 
     def listen_notifies(self, timeout):
         """Listen to notifications for `timeout` seconds"""
         if select.select([self.conn], [], [], timeout) == ([], [], []):
             return
         else:
             self.conn.poll()
             while self.conn.notifies:
                 yield self.conn.notifies.pop(0)
 
     @stored_procedure('swh_content_add')
     def content_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_directory_add')
     def directory_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_skipped_content_add')
     def skipped_content_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_revision_add')
     def revision_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_release_add')
     def release_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_occurrence_history_add')
     def occurrence_history_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_entity_history_add')
     def entity_history_add_from_temp(self, cur=None): pass
 
     @stored_procedure('swh_cache_content_revision_add')
     def cache_content_revision_add(self, cur=None): pass
 
     def store_tmp_bytea(self, ids, cur=None):
         """Store the given identifiers in a new tmp_bytea table"""
         cur = self._cursor(cur)
 
         self.mktemp_bytea(cur)
         self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea',
                      ['id'], cur)
 
     content_get_metadata_keys = ['sha1', 'sha1_git', 'sha256', 'length',
                                  'status']
 
     def content_get_metadata_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute("""select t.id as sha1, %s from tmp_bytea t
                        left join content on t.id = content.sha1
                     """ % ', '.join(self.content_get_metadata_keys[1:]))
 
         yield from cursor_to_bytes(cur)
 
     def content_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute("""SELECT sha1, sha1_git, sha256
                        FROM swh_content_missing()""")
 
         yield from cursor_to_bytes(cur)
 
     def content_missing_per_sha1_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute("""SELECT *
                        FROM swh_content_missing_per_sha1()""")
 
         yield from cursor_to_bytes(cur)
 
     def skipped_content_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute("""SELECT sha1, sha1_git, sha256
                        FROM swh_skipped_content_missing()""")
 
         yield from cursor_to_bytes(cur)
 
     def occurrence_get(self, origin_id, cur=None):
         """Retrieve latest occurrence's information by origin_id.
 
         """
         cur = self._cursor(cur)
 
         cur.execute("""SELECT origin, branch, target, target_type,
                               (select max(date) from origin_visit
                                where origin=%s) as date
                        FROM occurrence
                        WHERE origin=%s
                     """,
                     (origin_id, origin_id))
 
         yield from cursor_to_bytes(cur)
 
     def content_find(self, sha1=None, sha1_git=None, sha256=None, cur=None):
         """Find the content optionally on a combination of the following
         checksums sha1, sha1_git or sha256.
 
         Args:
             sha1: sha1 content
             git_sha1: the sha1 computed `a la git` sha1 of the content
             sha256: sha256 content
 
         Returns:
             The triplet (sha1, sha1_git, sha256) if found or None.
 
         """
         cur = self._cursor(cur)
 
         cur.execute("""SELECT sha1, sha1_git, sha256, length, ctime, status
                        FROM swh_content_find(%s, %s, %s)
                        LIMIT 1""", (sha1, sha1_git, sha256))
 
         content = line_to_bytes(cur.fetchone())
         if set(content) == {None}:
             return None
         else:
             return content
 
     provenance_cols = ['content', 'revision', 'origin', 'visit', 'path']
 
     def content_find_provenance(self, sha1_git, cur=None):
         """Find content's provenance information
 
         Args:
             sha1: sha1_git content
             cur: cursor to use
 
         Returns:
             Provenance information on such content
 
         """
         cur = self._cursor(cur)
 
         cur.execute("""SELECT content, revision, origin, visit, path
                        FROM swh_content_find_provenance(%s)""",
                     (sha1_git, ))
 
         yield from cursor_to_bytes(cur)
 
     def directory_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('''SELECT id, file_entries, dir_entries, rev_entries
                        FROM swh_directory_get()''')
         yield from cursor_to_bytes(cur)
 
     def directory_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_directory_missing()')
         yield from cursor_to_bytes(cur)
 
     directory_ls_cols = ['dir_id', 'type', 'target', 'name', 'perms',
                          'status', 'sha1', 'sha1_git', 'sha256']
 
     def directory_walk_one(self, directory, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_directory_walk_one(%s)', (directory,))
         yield from cursor_to_bytes(cur)
 
     def directory_walk(self, directory, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_directory_walk(%s)', (directory,))
         yield from cursor_to_bytes(cur)
 
     def revision_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
 
         cur.execute('SELECT id FROM swh_revision_missing() as r(id)')
 
         yield from cursor_to_bytes(cur)
 
     revision_add_cols = [
         'id', 'date', 'date_offset', 'date_neg_utc_offset', 'committer_date',
         'committer_date_offset', 'committer_date_neg_utc_offset', 'type',
         'directory', 'message', 'author_fullname', 'author_name',
         'author_email', 'committer_fullname', 'committer_name',
         'committer_email', 'metadata', 'synthetic',
     ]
 
     revision_get_cols = revision_add_cols + [
         'author_id', 'committer_id', 'parents']
 
     def origin_visit_add(self, origin, ts, cur=None):
         """Add a new origin_visit for origin origin at timestamp ts with
         status 'ongoing'.
 
         Args:
             origin: origin concerned by the visit
             ts: the date of the visit
 
         Returns:
             The new visit index step for that origin
 
         """
         cur = self._cursor(cur)
         self._cursor(cur).execute('SELECT swh_origin_visit_add(%s, %s)',
                                   (origin, ts))
         return cur.fetchone()[0]
 
     def origin_visit_update(self, origin, visit_id, status,
                             metadata, cur=None):
         """Update origin_visit's status."""
         cur = self._cursor(cur)
         update = """UPDATE origin_visit
                     SET status=%s, metadata=%s
                     WHERE origin=%s AND visit=%s"""
         cur.execute(update, (status, jsonize(metadata), origin, visit_id))
 
     origin_visit_get_cols = ['origin', 'visit', 'date', 'status', 'metadata']
 
     def origin_visit_get_all(self, origin_id, cur=None):
         """Retrieve all visits for origin with id origin_id.
 
         Args:
             origin_id: The occurrence's origin
 
         Yields:
             The occurrence's history visits
 
         """
         cur = self._cursor(cur)
 
         query = """\
         SELECT %s
         FROM origin_visit
         WHERE origin=%%s""" % (', '.join(self.origin_visit_get_cols))
 
         cur.execute(query, (origin_id, ))
 
         yield from cursor_to_bytes(cur)
 
     def origin_visit_get(self, origin_id, visit_id, cur=None):
         """Retrieve information on visit visit_id of origin origin_id.
 
         Args:
             origin_id: the origin concerned
             visit_id: The visit step for that origin
 
         Returns:
             The origin_visit information
 
         """
         cur = self._cursor(cur)
 
         query = """\
             SELECT %s
             FROM origin_visit
             WHERE origin = %%s AND visit = %%s
             """ % (', '.join(self.origin_visit_get_cols))
 
         cur.execute(query, (origin_id, visit_id))
         r = cur.fetchall()
         if not r:
             return None
         return line_to_bytes(r[0])
 
     occurrence_cols = ['origin', 'branch', 'target', 'target_type']
 
     def occurrence_by_origin_visit(self, origin_id, visit_id, cur=None):
         """Retrieve all occurrences for a particular origin_visit.
 
         Args:
             origin_id: the origin concerned
             visit_id: The visit step for that origin
 
         Yields:
             The occurrence's history visits
 
         """
         cur = self._cursor(cur)
 
         query = """\
             SELECT %s
             FROM swh_occurrence_by_origin_visit(%%s, %%s)
             """ % (', '.join(self.occurrence_cols))
 
         cur.execute(query, (origin_id, visit_id))
         yield from cursor_to_bytes(cur)
 
     def revision_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         query = 'SELECT %s FROM swh_revision_get()' % (
             ', '.join(self.revision_get_cols))
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def revision_log(self, root_revisions, limit=None, cur=None):
         cur = self._cursor(cur)
 
         query = """SELECT %s
                    FROM swh_revision_log(%%s, %%s)
                 """ % ', '.join(self.revision_get_cols)
 
         cur.execute(query, (root_revisions, limit))
         yield from cursor_to_bytes(cur)
 
     revision_shortlog_cols = ['id', 'parents']
 
     def revision_shortlog(self, root_revisions, limit=None, cur=None):
         cur = self._cursor(cur)
 
         query = """SELECT %s
                    FROM swh_revision_list(%%s, %%s)
                 """ % ', '.join(self.revision_shortlog_cols)
 
         cur.execute(query, (root_revisions, limit))
         yield from cursor_to_bytes(cur)
 
     cache_content_get_cols = [
         'sha1', 'sha1_git', 'sha256', 'revision_paths']
 
     def cache_content_get_all(self, cur=None):
         """Retrieve cache contents' sha1, sha256, sha1_git
 
         """
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_cache_content_get_all()')
         yield from cursor_to_bytes(cur)
 
     def cache_content_get(self, sha1_git, cur=None):
         """Retrieve cache content information sh.
 
         """
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_cache_content_get(%s)', (sha1_git, ))
         data = cur.fetchone()
         if data:
             return line_to_bytes(data)
         return None
 
     def cache_revision_origin_add(self, origin, visit, cur=None):
         """Populate the content provenance information cache for the given
            (origin, visit) couple."""
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_cache_revision_origin_add(%s, %s)',
                     (origin, visit))
         yield from cursor_to_bytes(cur)
 
     def release_missing_from_temp(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT id FROM swh_release_missing() as r(id)')
         yield from cursor_to_bytes(cur)
 
     object_find_by_sha1_git_cols = ['sha1_git', 'type', 'id', 'object_id']
 
     def object_find_by_sha1_git(self, ids, cur=None):
         cur = self._cursor(cur)
 
         self.store_tmp_bytea(ids, cur)
         query = 'select %s from swh_object_find_by_sha1_git()' % (
             ', '.join(self.object_find_by_sha1_git_cols)
         )
         cur.execute(query)
 
         yield from cursor_to_bytes(cur)
 
     def stat_counters(self, cur=None):
         cur = self._cursor(cur)
         cur.execute('SELECT * FROM swh_stat_counters()')
         yield from cur
 
     fetch_history_cols = ['origin', 'date', 'status', 'result', 'stdout',
                           'stderr', 'duration']
 
     def create_fetch_history(self, fetch_history, cur=None):
         """Create a fetch_history entry with the data in fetch_history"""
         cur = self._cursor(cur)
         query = '''INSERT INTO fetch_history (%s)
                    VALUES (%s) RETURNING id''' % (
             ','.join(self.fetch_history_cols),
             ','.join(['%s'] * len(self.fetch_history_cols))
         )
         cur.execute(query, [fetch_history.get(col) for col in
                             self.fetch_history_cols])
 
         return cur.fetchone()[0]
 
     def get_fetch_history(self, fetch_history_id, cur=None):
         """Get a fetch_history entry with the given id"""
         cur = self._cursor(cur)
         query = '''SELECT %s FROM fetch_history WHERE id=%%s''' % (
             ', '.join(self.fetch_history_cols),
         )
         cur.execute(query, (fetch_history_id,))
 
         data = cur.fetchone()
 
         if not data:
             return None
 
         ret = {'id': fetch_history_id}
         for i, col in enumerate(self.fetch_history_cols):
             ret[col] = data[i]
 
         return ret
 
     def update_fetch_history(self, fetch_history, cur=None):
         """Update the fetch_history entry from the data in fetch_history"""
         cur = self._cursor(cur)
         query = '''UPDATE fetch_history
                    SET %s
                    WHERE id=%%s''' % (
             ','.join('%s=%%s' % col for col in self.fetch_history_cols)
         )
         cur.execute(query, [jsonize(fetch_history.get(col)) for col in
                             self.fetch_history_cols + ['id']])
 
     base_entity_cols = ['uuid', 'parent', 'name', 'type',
                         'description', 'homepage', 'active',
                         'generated', 'lister_metadata',
                         'metadata']
 
     entity_cols = base_entity_cols + ['last_seen', 'last_id']
     entity_history_cols = base_entity_cols + ['id', 'validity']
 
     def origin_add(self, type, url, cur=None):
         """Insert a new origin and return the new identifier."""
         insert = """INSERT INTO origin (type, url) values (%s, %s)
                     RETURNING id"""
 
         cur.execute(insert, (type, url))
         return cur.fetchone()[0]
 
     def origin_get_with(self, type, url, cur=None):
         """Retrieve the origin id from its type and url if found."""
         cur = self._cursor(cur)
 
         query = """SELECT id, type, url, lister, project
                    FROM origin
                    WHERE type=%s AND url=%s"""
 
         cur.execute(query, (type, url))
         data = cur.fetchone()
         if data:
             return line_to_bytes(data)
         return None
 
     def origin_get(self, id, cur=None):
         """Retrieve the origin per its identifier.
 
         """
         cur = self._cursor(cur)
 
         query = "SELECT id, type, url, lister, project FROM origin WHERE id=%s"
 
         cur.execute(query, (id,))
         data = cur.fetchone()
         if data:
             return line_to_bytes(data)
         return None
 
     person_cols = ['fullname', 'name', 'email']
     person_get_cols = person_cols + ['id']
 
     def person_add(self, person, cur=None):
         """Add a person identified by its name and email.
 
         Returns:
             The new person's id
 
         """
         cur = self._cursor(cur)
 
         query_new_person = '''\
         INSERT INTO person(%s)
         VALUES (%s)
         RETURNING id''' % (
             ', '.join(self.person_cols),
             ', '.join('%s' for i in range(len(self.person_cols)))
         )
         cur.execute(query_new_person,
                     [person[col] for col in self.person_cols])
         return cur.fetchone()[0]
 
     def person_get(self, ids, cur=None):
         """Retrieve the persons identified by the list of ids.
 
         """
         cur = self._cursor(cur)
 
         query = """SELECT %s
                    FROM person
                    WHERE id IN %%s""" % ', '.join(self.person_get_cols)
 
         cur.execute(query, (tuple(ids),))
         yield from cursor_to_bytes(cur)
 
     release_add_cols = [
         'id', 'target', 'target_type', 'date', 'date_offset',
         'date_neg_utc_offset', 'name', 'comment', 'synthetic',
         'author_fullname', 'author_name', 'author_email',
     ]
     release_get_cols = release_add_cols + ['author_id']
 
     def release_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         query = '''
         SELECT %s
             FROM swh_release_get()
         ''' % ', '.join(self.release_get_cols)
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def release_get_by(self,
                        origin_id,
                        limit=None,
                        cur=None):
         """Retrieve a release by occurrence criterion (only origin right now)
 
         Args:
             - origin_id: The origin to look for.
 
         """
         cur = self._cursor(cur)
         query = """
         SELECT %s
             FROM swh_release_get_by(%%s)
             LIMIT %%s
         """ % ', '.join(self.release_get_cols)
         cur.execute(query, (origin_id, limit))
         yield from cursor_to_bytes(cur)
 
     def revision_get_by(self,
                         origin_id,
                         branch_name,
                         datetime,
                         limit=None,
                         cur=None):
         """Retrieve a revision by occurrence criterion.
 
         Args:
             - origin_id: The origin to look for
             - branch_name: the branch name to look for
             - datetime: the lower bound of timerange to look for.
             - limit: limit number of results to return
             The upper bound being now.
         """
         cur = self._cursor(cur)
         if branch_name and isinstance(branch_name, str):
             branch_name = branch_name.encode('utf-8')
 
         query = '''
         SELECT %s
             FROM swh_revision_get_by(%%s, %%s, %%s)
             LIMIT %%s
         ''' % ', '.join(self.revision_get_cols)
 
         cur.execute(query, (origin_id, branch_name, datetime, limit))
         yield from cursor_to_bytes(cur)
 
     def directory_entry_get_by_path(self, directory, paths, cur=None):
         """Retrieve a directory entry by path.
 
         """
         cur = self._cursor(cur)
         cur.execute("""SELECT dir_id, type, target, name, perms, status, sha1,
                        sha1_git, sha256
                        FROM swh_find_directory_entry_by_path(%s, %s)""",
                     (directory, paths))
 
         data = cur.fetchone()
         if set(data) == {None}:
             return None
         return line_to_bytes(data)
 
     def entity_get(self, uuid, cur=None):
         """Retrieve the entity and its parent hierarchy chain per uuid.
 
         """
         cur = self._cursor(cur)
         cur.execute("""SELECT %s
                        FROM swh_entity_get(%%s)""" % (
                            ', '.join(self.entity_cols)),
                     (uuid, ))
         yield from cursor_to_bytes(cur)
 
     def entity_get_one(self, uuid, cur=None):
         """Retrieve a single entity given its uuid.
 
         """
         cur = self._cursor(cur)
         cur.execute("""SELECT %s
                        FROM entity
                        WHERE uuid = %%s""" % (
                            ', '.join(self.entity_cols)),
                     (uuid, ))
         data = cur.fetchone()
         if not data:
             return None
         return line_to_bytes(data)
 
     def content_mimetype_missing_from_temp(self, cur=None):
         """List missing mimetypes.
 
         """
         cur = self._cursor(cur)
         cur.execute("SELECT * FROM swh_content_mimetype_missing()")
         yield from cursor_to_bytes(cur)
 
     def content_mimetype_add_from_temp(self, conflict_update, cur=None):
         self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)",
                                   (conflict_update, ))
 
     content_mimetype_cols = ['id', 'mimetype', 'encoding']
 
     def content_mimetype_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         query = "SELECT %s FROM swh_content_mimetype_get()" % (
             ','.join(self.content_mimetype_cols))
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def content_language_missing_from_temp(self, cur=None):
         """List missing languages.
 
         """
         cur = self._cursor(cur)
         cur.execute("SELECT * FROM swh_content_language_missing()")
         yield from cursor_to_bytes(cur)
 
     def content_language_add_from_temp(self, conflict_update, cur=None):
         self._cursor(cur).execute("SELECT swh_content_language_add(%s)",
                                   (conflict_update, ))
 
     content_language_cols = ['id', 'lang']
 
     def content_language_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         query = "SELECT %s FROM swh_content_language_get()" % (
             ','.join(self.content_language_cols))
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def content_ctags_missing_from_temp(self, cur=None):
         """List missing ctags.
 
         """
         cur = self._cursor(cur)
         cur.execute("SELECT * FROM swh_content_ctags_missing()")
         yield from cursor_to_bytes(cur)
 
     def content_ctags_add_from_temp(self, conflict_update, cur=None):
         self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)",
                                   (conflict_update, ))
 
     content_ctags_cols = ['id', 'name', 'kind', 'line', 'lang']
 
     def content_ctags_get_from_temp(self, cur=None):
         cur = self._cursor(cur)
         query = "SELECT %s FROM swh_content_ctags_get()" % (
             ','.join(self.content_ctags_cols))
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def content_ctags_search(self, expression, limit, offset, cur=None):
         cur = self._cursor(cur)
-        query = "SELECT %s FROM swh_content_ctags_search(%%s, %%s, %%s)" % (
-            ','.join(self.content_ctags_cols))
-        cur.execute(query, (expression, limit, offset))
-        yield from cursor_to_bytes(cur)
+        try:
+            query = """SELECT %s
+                       FROM swh_content_ctags_search(%%s, %%s, %%s)""" % (
+                ','.join(self.content_ctags_cols))
+            cur.execute(query, (expression, limit, offset))
+            yield from cursor_to_bytes(cur)
+        except psycopg2.InternalError as e:  # 'expression' is incorrect
+            raise BadSyntaxAPIError("Bad syntax in expression '%s'" %
+                                    expression)
 
     def content_fossology_license_missing_from_temp(self, cur=None):
         """List missing licenses.
 
         """
         cur = self._cursor(cur)
         cur.execute("SELECT * FROM swh_content_fossology_license_missing()")
         yield from cursor_to_bytes(cur)
 
     def content_fossology_license_add_from_temp(self, conflict_update,
                                                 cur=None):
         """Add new licenses per content.
 
         """
         self._cursor(cur).execute(
             "SELECT swh_content_fossology_license_add(%s)",
             (conflict_update, ))
 
     content_fossology_license_cols = ['id', 'tool_name', 'tool_version',
                                       'licenses']
 
     def content_fossology_license_get_from_temp(self, cur=None):
         """Retrieve licenses per content.
 
         """
         cur = self._cursor(cur)
         query = "SELECT %s FROM swh_content_fossology_license_get()" % (
             ','.join(self.content_fossology_license_cols))
         cur.execute(query)
         yield from cursor_to_bytes(cur)
 
     def content_fossology_license_unknown(self, cur=None):
         """Returns the unknown licenses from
            tmp_content_fossology_license_unknown.
 
         """
         cur = self._cursor(cur)
         cur.execute("SELECT * FROM swh_content_fossology_license_unknown()")
         yield from cursor_to_bytes(cur)
diff --git a/swh/storage/exc.py b/swh/storage/exc.py
index c3bf1c118..ed5e1ae1f 100644
--- a/swh/storage/exc.py
+++ b/swh/storage/exc.py
@@ -1,23 +1,29 @@
-# Copyright (C) 2015  The Software Heritage developers
+# Copyright (C) 2015-2016  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 class StorageDBError(Exception):
     """Specific storage db error (connection, erroneous queries, etc...)
 
     """
 
     def __str__(self):
         return 'An unexpected error occurred in the backend: %s' % self.args
 
 
 class StorageAPIError(Exception):
     """Specific internal storage api (mainly connection)
 
     """
 
     def __str__(self):
         args = self.args
         return 'An unexpected error occurred in the api backend: %s' % args
+
+
+class BadSyntaxAPIError(Exception):
+    def __str__(self):
+        args = self.args
+        return 'Syntax error in the request: %s' % args