diff --git a/sql/swh-func.sql b/sql/swh-func.sql index 82d0955..0116b8c 100644 --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -1,731 +1,698 @@ -- Postgresql index helper function create or replace function hash_sha1(text) returns text language sql strict immutable as $$ select encode(public.digest($1, 'sha1'), 'hex') $$; comment on function hash_sha1(text) is 'Compute sha1 hash as text'; -- create a temporary table with a single "bytea" column for fast object lookup. create or replace function swh_mktemp_bytea() returns void language sql as $$ create temporary table tmp_bytea ( id bytea ) on commit drop; $$; -- create a temporary table called tmp_TBLNAME, mimicking existing table -- TBLNAME -- -- Args: -- tblname: name of the table to mimick create or replace function swh_mktemp(tblname regclass) returns void language plpgsql as $$ begin execute format(' create temporary table tmp_%1$I (like %1$I including defaults) on commit drop; alter table tmp_%1$I drop column if exists object_id; ', tblname); return; end $$; --- create a temporary table for content_ctags tmp_content_mimetype_missing, -create or replace function swh_mktemp_content_mimetype_missing() - returns void - language sql -as $$ - create temporary table tmp_content_mimetype_missing ( - id sha1, - indexer_configuration_id bigint - ) on commit drop; -$$; - -comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information'; - --- check which entries of tmp_bytea are missing from content_mimetype --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_mimetype_missing as tmp - where not exists - (select 1 from content_mimetype as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id)); - return; -end -$$; - -comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information'; - -- create a temporary table for content_mimetype tmp_content_mimetype, create or replace function swh_mktemp_content_mimetype() returns void language sql as $$ create temporary table tmp_content_mimetype ( like content_mimetype including defaults ) on commit drop; $$; comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information'; -- add tmp_content_mimetype entries to content_mimetype, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_mimetype_missing must take place before calling this -- function. -- -- -- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype, -- 2. call this function create or replace function swh_content_mimetype_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm on conflict(id, indexer_configuration_id) do update set mimetype = excluded.mimetype, encoding = excluded.encoding; else insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; create type content_mimetype_signature as( id sha1, mimetype bytea, encoding bytea, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); -- Retrieve list of content mimetype from the temporary table. -- -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, -- 2. call this function create or replace function swh_content_mimetype_get() returns setof content_mimetype_signature language plpgsql as $$ begin return query select c.id, mimetype, encoding, i.id as tool_id, tool_name, tool_version, tool_configuration from tmp_bytea t inner join content_mimetype c on c.id=t.id inner join indexer_configuration i on c.indexer_configuration_id=i.id; return; end $$; comment on function swh_content_mimetype_get() IS 'List content''s mimetypes'; -- create a temporary table for content_language tmp_content_language, create or replace function swh_mktemp_content_language_missing() returns void language sql as $$ create temporary table tmp_content_language_missing ( id sha1, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language'; -- check which entries of tmp_bytea are missing from content_language -- -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, -- 2. call this function create or replace function swh_content_language_missing() returns setof sha1 language plpgsql as $$ begin return query select id::sha1 from tmp_content_language_missing as tmp where not exists (select 1 from content_language as c where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); return; end $$; comment on function swh_content_language_missing() IS 'Filter missing content languages'; -- add tmp_content_language entries to content_language, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_language_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_language, 2. call this function create or replace function swh_content_language_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_language (id, lang, indexer_configuration_id) select id, lang, indexer_configuration_id from tmp_content_language tcl on conflict(id, indexer_configuration_id) do update set lang = excluded.lang; else insert into content_language (id, lang, indexer_configuration_id) select id, lang, indexer_configuration_id from tmp_content_language tcl on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_language_add(boolean) IS 'Add new content languages'; -- create a temporary table for retrieving content_language create or replace function swh_mktemp_content_language() returns void language sql as $$ create temporary table tmp_content_language ( like content_language including defaults ) on commit drop; $$; comment on function swh_mktemp_content_language() is 'Helper table to add content language'; create type content_language_signature as ( id sha1, lang languages, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); -- Retrieve list of content language from the temporary table. -- -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function create or replace function swh_content_language_get() returns setof content_language_signature language plpgsql as $$ begin return query select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration from tmp_bytea t inner join content_language c on c.id = t.id inner join indexer_configuration i on i.id=c.indexer_configuration_id; return; end $$; comment on function swh_content_language_get() is 'List content''s language'; -- create a temporary table for content_ctags tmp_content_ctags, create or replace function swh_mktemp_content_ctags() returns void language sql as $$ create temporary table tmp_content_ctags ( like content_ctags including defaults ) on commit drop; $$; comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; -- add tmp_content_ctags entries to content_ctags, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, -- 2. call this function create or replace function swh_content_ctags_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then delete from content_ctags where id in (select tmp.id from tmp_content_ctags tmp inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); end if; insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) select id, name, kind, line, lang, indexer_configuration_id from tmp_content_ctags tct on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) do nothing; return; end $$; comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; -- create a temporary table for content_ctags missing routine create or replace function swh_mktemp_content_ctags_missing() returns void language sql as $$ create temporary table tmp_content_ctags_missing ( id sha1, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags'; -- check which entries of tmp_bytea are missing from content_ctags -- -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, -- 2. call this function create or replace function swh_content_ctags_missing() returns setof sha1 language plpgsql as $$ begin return query (select id::sha1 from tmp_content_ctags_missing as tmp where not exists (select 1 from content_ctags as c where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id limit 1)); return; end $$; comment on function swh_content_ctags_missing() IS 'Filter missing content ctags'; create type content_ctags_signature as ( id sha1, name text, kind text, line bigint, lang ctags_languages, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); -- Retrieve list of content ctags from the temporary table. -- -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function create or replace function swh_content_ctags_get() returns setof content_ctags_signature language plpgsql as $$ begin return query select c.id, c.name, c.kind, c.line, c.lang, i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration from tmp_bytea t inner join content_ctags c using(id) inner join indexer_configuration i on i.id = c.indexer_configuration_id order by line; return; end $$; comment on function swh_content_ctags_get() IS 'List content ctags'; -- Search within ctags content. -- create or replace function swh_content_ctags_search( expression text, l integer default 10, last_sha1 sha1 default '\x0000000000000000000000000000000000000000') returns setof content_ctags_signature language sql as $$ select c.id, name, kind, line, lang, i.id as tool_id, tool_name, tool_version, tool_configuration from content_ctags c inner join indexer_configuration i on i.id = c.indexer_configuration_id where hash_sha1(name) = hash_sha1(expression) and c.id > last_sha1 order by id limit l; $$; comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; -- create a temporary table for content_fossology_license tmp_content_fossology_license, create or replace function swh_mktemp_content_fossology_license() returns void language sql as $$ create temporary table tmp_content_fossology_license ( id sha1, license text, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; -- add tmp_content_fossology_license entries to content_fossology_license, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to -- tmp_content_fossology_license, 2. call this function create or replace function swh_content_fossology_license_add(conflict_update boolean) returns void language plpgsql as $$ begin -- insert unknown licenses first insert into fossology_license (name) select distinct license from tmp_content_fossology_license tmp where not exists (select 1 from fossology_license where name=tmp.license) on conflict(name) do nothing; if conflict_update then -- delete from content_fossology_license c -- using tmp_content_fossology_license tmp, indexer_configuration i -- where c.id = tmp.id and i.id=tmp.indexer_configuration_id delete from content_fossology_license where id in (select tmp.id from tmp_content_fossology_license tmp inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); end if; insert into content_fossology_license (id, license_id, indexer_configuration_id) select tcl.id, (select id from fossology_license where name = tcl.license) as license, indexer_configuration_id from tmp_content_fossology_license tcl on conflict(id, license_id, indexer_configuration_id) do nothing; return; end $$; comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; create type content_fossology_license_signature as ( id sha1, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb, licenses text[] ); -- Retrieve list of content license from the temporary table. -- -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, -- 2. call this function create or replace function swh_content_fossology_license_get() returns setof content_fossology_license_signature language plpgsql as $$ begin return query select cl.id, ic.id as tool_id, ic.tool_name, ic.tool_version, ic.tool_configuration, array(select name from fossology_license where id = ANY(array_agg(cl.license_id))) as licenses from tmp_bytea tcl inner join content_fossology_license cl using(id) inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration; return; end $$; comment on function swh_content_fossology_license_get() IS 'List content licenses'; -- content_metadata functions -- -- create a temporary table for content_metadata tmp_content_metadata, create or replace function swh_mktemp_content_metadata_missing() returns void language sql as $$ create temporary table tmp_content_metadata_missing ( id sha1, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; -- check which entries of tmp_bytea are missing from content_metadata -- -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, -- 2. call this function create or replace function swh_content_metadata_missing() returns setof sha1 language plpgsql as $$ begin return query select id::sha1 from tmp_content_metadata_missing as tmp where not exists (select 1 from content_metadata as c where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); return; end $$; comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; -- add tmp_content_metadata entries to content_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_metadata, 2. call this function create or replace function swh_content_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do update set translated_metadata = excluded.translated_metadata; else insert into content_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; -- create a temporary table for retrieving content_metadata create or replace function swh_mktemp_content_metadata() returns void language sql as $$ create temporary table tmp_content_metadata ( like content_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; -- create type content_metadata_signature as ( id sha1, translated_metadata jsonb, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); -- Retrieve list of content metadata from the temporary table. -- -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function create or replace function swh_content_metadata_get() returns setof content_metadata_signature language plpgsql as $$ begin return query select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration from tmp_bytea t inner join content_metadata c on c.id = t.id inner join indexer_configuration i on i.id=c.indexer_configuration_id; return; end $$; comment on function swh_content_metadata_get() is 'List content''s metadata'; -- end content_metadata functions -- revision_metadata functions -- -- create a temporary table for revision_metadata tmp_revision_metadata, create or replace function swh_mktemp_revision_metadata_missing() returns void language sql as $$ create temporary table tmp_revision_metadata_missing ( id sha1_git, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata'; -- check which entries of tmp_bytea are missing from revision_metadata -- -- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, -- 2. call this function create or replace function swh_revision_metadata_missing() returns setof sha1 language plpgsql as $$ begin return query select id::sha1 from tmp_revision_metadata_missing as tmp where not exists (select 1 from revision_metadata as c where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); return; end $$; comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata'; -- add tmp_revision_metadata entries to revision_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_revision_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_revision_metadata, 2. call this function create or replace function swh_revision_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into revision_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_revision_metadata tcm on conflict(id, indexer_configuration_id) do update set translated_metadata = excluded.translated_metadata; else insert into revision_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_revision_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata'; -- create a temporary table for retrieving revision_metadata create or replace function swh_mktemp_revision_metadata() returns void language sql as $$ create temporary table tmp_revision_metadata ( like revision_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; -- create type revision_metadata_signature as ( id sha1_git, translated_metadata jsonb, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); -- Retrieve list of revision metadata from the temporary table. -- -- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function create or replace function swh_revision_metadata_get() returns setof revision_metadata_signature language plpgsql as $$ begin return query select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration from tmp_bytea t inner join revision_metadata c on c.id = t.id inner join indexer_configuration i on i.id=c.indexer_configuration_id; return; end $$; create or replace function swh_mktemp_indexer_configuration() returns void language sql as $$ create temporary table tmp_indexer_configuration ( like indexer_configuration including defaults ) on commit drop; alter table tmp_indexer_configuration drop column id; $$; -- add tmp_indexer_configuration entries to indexer_configuration, -- skipping duplicates if any. -- -- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to -- it, 2. call this function to insert and filtering out duplicates create or replace function swh_indexer_configuration_add() returns setof indexer_configuration language plpgsql as $$ begin insert into indexer_configuration(tool_name, tool_version, tool_configuration) select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp on conflict(tool_name, tool_version, tool_configuration) do nothing; return query select id, tool_name, tool_version, tool_configuration from tmp_indexer_configuration join indexer_configuration using(tool_name, tool_version, tool_configuration); return; end $$; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql index b950793..6151217 100644 --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -1,138 +1,138 @@ --- --- Software Heritage Indexers Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(114, now(), 'Work In Progress'); + values(115, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_version is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) create table content_mimetype ( id sha1 not null, mimetype bytea not null, encoding bytea not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containning metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, translated_metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; comment on column content_metadata.translated_metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_metadata provides a minimal set of intrinsic metadata -- detected with the detection tool (indexer_configuration_id) and aggregated -- from the content_metadata translation. create table revision_metadata( id sha1_git not null, translated_metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_metadata.id is 'sha1_git of revision'; comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; create table origin_metadata_translation( id bigserial not null, -- PK origin_metadata identifier result jsonb, tool_id bigint ); comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry'; comment on column origin_metadata_translation.id is 'the entry id in origin_metadata'; comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool'; comment on column origin_metadata_translation.tool_id is 'tool used for translation'; diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql new file mode 100644 index 0000000..110f87a --- /dev/null +++ b/sql/upgrades/115.sql @@ -0,0 +1,10 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 114 +-- to_version: 115 +-- description: Remove temporary table use in reading api + +insert into dbversion(version, release, description) +values(115, now(), 'Work In Progress'); + +drop function swh_mktemp_content_mimetype_missing(); +drop function swh_content_mimetype_missing(); diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 20fd5e4..0051953 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,538 +1,534 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import dateutil.parser import psycopg2 from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError from .db import Db from . import converters INDEXER_CFG_KEY = 'indexer_storage' def get_indexer_storage(cls, args): """Get an indexer storage object of class `storage_class` with arguments `storage_args`. Args: args (dict): dictionary with keys: - cls (str): storage's class, either 'local' or 'remote' - args (dict): dictionary with keys Returns: an instance of swh.indexer's storage (either local or remote) Raises: ValueError if passed an unknown storage class. """ if cls == 'remote': from .api.client import RemoteStorage as IndexerStorage elif cls == 'local': from . import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) class IndexerStorage(): """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) def check_config(self, *, check_write): """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables with self.get_db().transaction() as cur: if check_write: check = 'INSERT' else: check = 'SELECT' cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,) ) return cur.fetchone()[0] return True @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): """List mimetypes missing from storage. Args: mimetypes (iterable): iterable of dict with keys: - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + - indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: an iterable of missing id for the triplets id, tool_name, - tool_version + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_mimetype_missing(cur) - db.copy_to(mimetypes, 'tmp_content_mimetype_missing', - ['id', 'indexer_configuration_id'], - cur) - for obj in db.content_mimetype_missing_from_temp(cur): + for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - id (bytes): sha1 identifier - mimetype (bytes): raw content's mimetype - encoding (bytes): raw content's encoding - indexer_configuration_id (int): tool's id used to compute the results - conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): db.store_tmp_bytea(ids, cur) for c in db.content_mimetype_get_from_temp(): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): """List languages missing from storage. Args: languages (iterable): dictionaries with keys: - id (bytes): sha1 identifier - tool_name (str): tool used to compute the results - tool_version (str): associated tool's version Returns: iterable: identifiers of missing languages """ db.mktemp_content_language_missing(cur) db.copy_to(languages, 'tmp_content_language_missing', ['id', 'indexer_configuration_id'], cur) for obj in db.content_language_missing_from_temp(cur): yield obj[0] @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): db.store_tmp_bytea(ids, cur) for c in db.content_language_get_from_temp(): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): """Add languages not present in storage. Args: languages (iterable): dictionaries with keys: - id: sha1 - lang: bytes conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - id (bytes): sha1 identifier - tool_name (str): tool name used - tool_version (str): associated version Returns: an iterable of missing id """ db.mktemp_content_ctags_missing(cur) db.copy_to(ctags, tblname='tmp_content_ctags_missing', columns=['id', 'indexer_configuration_id'], cur=cur) for obj in db.content_ctags_missing_from_temp(cur): yield obj[0] @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums """ db.store_tmp_bytea(ids, cur) for c in db.content_ctags_get_from_temp(): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - id (bytes): sha1 - ctags ([list): List of dictionary with keys: name, kind, line, language """ def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... """ for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): """Retrieve licenses per id. Args: ids (iterable): sha1 checksums Yields: list: dictionaries with the following keys: - id (bytes) - licenses ([str]): associated licenses for that content """ db.store_tmp_bytea(ids, cur) for c in db.content_fossology_license_get_from_temp(): license = dict(zip(db.content_fossology_license_cols, c)) yield converters.db_to_fossology_license(license) @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): """Add licenses not present in storage. Args: licenses (iterable): dictionaries with keys: - id: sha1 - license ([bytes]): List of licenses associated to sha1 - tool (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) Returns: list: content_license entries which failed due to unknown licenses """ # Then, we add the correct ones db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_metadata_missing(self, metadatas, db=None, cur=None): """List metadatas missing from storage. Args: metadatas (iterable): dictionaries with keys: - id (bytes): sha1 identifier - tool_name (str): tool used to compute the results - tool_version (str): associated tool's version Returns: iterable: missing ids """ db.mktemp_content_metadata_missing(cur) db.copy_to(metadatas, 'tmp_content_metadata_missing', ['id', 'indexer_configuration_id'], cur) for obj in db.content_metadata_missing_from_temp(cur): yield obj[0] @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): db.store_tmp_bytea(ids, cur) for c in db.content_metadata_get_from_temp(): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @db_transaction() def content_metadata_add(self, metadatas, conflict_update=False, db=None, cur=None): """Add metadatas not present in storage. Args: metadatas (iterable): dictionaries with keys: - id: sha1 - translated_metadata: bytes / jsonb ? conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_metadata(cur) # empty metadata is mapped to 'unknown' db.copy_to(metadatas, 'tmp_content_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @db_transaction_generator() def revision_metadata_missing(self, metadatas, db=None, cur=None): """List metadatas missing from storage. Args: metadatas (iterable): dictionaries with keys: - id (bytes): sha1_git revision identifier - tool_name (str): tool used to compute the results - tool_version (str): associated tool's version Returns: iterable: missing ids """ db.mktemp_revision_metadata_missing(cur) db.copy_to(metadatas, 'tmp_revision_metadata_missing', ['id', 'indexer_configuration_id'], cur) for obj in db.revision_metadata_missing_from_temp(cur): yield obj[0] @db_transaction_generator() def revision_metadata_get(self, ids, db=None, cur=None): db.store_tmp_bytea(ids, cur) for c in db.revision_metadata_get_from_temp(): yield converters.db_to_metadata( dict(zip(db.revision_metadata_cols, c))) @db_transaction() def revision_metadata_add(self, metadatas, conflict_update=False, db=None, cur=None): """Add metadatas not present in storage. Args: metadatas (iterable): dictionaries with keys: - id: sha1_git of revision - translated_metadata: bytes / jsonb ? conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_revision_metadata(cur) # empty metadata is mapped to 'unknown' db.copy_to(metadatas, 'tmp_revision_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) @db_transaction() def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, db=None, cur=None): """ Add an origin_metadata for the origin at ts with provenance and metadata. Args: origin_id (int): the origin's id for which the metadata is added ts (datetime): timestamp of the found metadata provider (int): the provider of metadata (ex:'hal') tool (int): tool used to extract metadata metadata (jsonb): the metadata retrieved at the time and location Returns: id (int): the origin_metadata unique id """ if isinstance(ts, str): ts = dateutil.parser.parse(ts) return db.origin_metadata_add(origin_id, ts, provider, tool, metadata, cur) @db_transaction_generator() def origin_metadata_get_by(self, origin_id, provider_type=None, db=None, cur=None): """Retrieve list of all origin_metadata entries for the origin_id Args: origin_id (int): the unique origin identifier provider_type (str): (optional) type of provider Returns: list of dicts: the origin_metadata dictionary with the keys: - id (int): origin_metadata's id - origin_id (int): origin's id - discovery_date (datetime): timestamp of discovery - tool_id (int): metadata's extracting tool - metadata (jsonb) - provider_id (int): metadata's provider - provider_name (str) - provider_type (str) - provider_url (str) """ for line in db.origin_metadata_get_by(origin_id, provider_type, cur): yield dict(zip(db.origin_metadata_get_cols, line)) @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys:: tool_name (str): tool's name tool_version (str): tool's version tool_configuration (dict): tool's configuration (free form dict) Returns: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. """ db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys:: tool_name (str): tool's name tool_version (str): tool's version tool_configuration (dict): tool's configuration (free form dict) Returns: The identifier of the tool if it exists, None otherwise. """ tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py index b51402e..3a7341d 100644 --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,245 +1,258 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model import hashutil from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes -from swh.storage.db import line_to_bytes +from swh.storage.db import line_to_bytes, execute_values_to_bytes class Db(BaseDb): """Proxy to the SWH Indexer DB, with wrappers around stored procedures """ @stored_procedure('swh_mktemp_bytea') def mktemp_bytea(self, cur=None): pass def store_tmp_bytea(self, ids, cur=None): """Store the given identifiers in a new tmp_bytea table""" cur = self._cursor(cur) self.mktemp_bytea(cur) self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', ['id'], cur) - content_mimetype_cols = [ - 'id', 'mimetype', 'encoding', - 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + content_mimetype_hash_keys = ['id', 'indexer_configuration_id'] - @stored_procedure('swh_mktemp_content_mimetype_missing') - def mktemp_content_mimetype_missing(self, cur=None): pass - - def content_mimetype_missing_from_temp(self, cur=None): + def content_mimetype_missing_from_list(self, mimetypes, cur=None): """List missing mimetypes. """ cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_mimetype_missing()") - yield from cursor_to_bytes(cur) + keys = ', '.join(self.content_mimetype_hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) + for key in self.content_mimetype_hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from content_mimetype c + where %s + ) + """ % (keys, keys, equality), + (tuple(m[k] for k in self.content_mimetype_hash_keys) + for m in mimetypes) + ) + + content_mimetype_cols = [ + 'id', 'mimetype', 'encoding', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_mimetype') def mktemp_content_mimetype(self, cur=None): pass def content_mimetype_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", (conflict_update, )) def content_mimetype_get_from_temp(self, cur=None): cur = self._cursor(cur) query = "SELECT %s FROM swh_content_mimetype_get()" % ( ','.join(self.content_mimetype_cols)) cur.execute(query) yield from cursor_to_bytes(cur) content_language_cols = [ 'id', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_language') def mktemp_content_language(self, cur=None): pass @stored_procedure('swh_mktemp_content_language_missing') def mktemp_content_language_missing(self, cur=None): pass def content_language_missing_from_temp(self, cur=None): """List missing languages. """ cur = self._cursor(cur) cur.execute("SELECT * FROM swh_content_language_missing()") yield from cursor_to_bytes(cur) def content_language_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_language_add(%s)", (conflict_update, )) def content_language_get_from_temp(self, cur=None): cur = self._cursor(cur) query = "SELECT %s FROM swh_content_language_get()" % ( ','.join(self.content_language_cols)) cur.execute(query) yield from cursor_to_bytes(cur) content_ctags_cols = [ 'id', 'name', 'kind', 'line', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_ctags') def mktemp_content_ctags(self, cur=None): pass @stored_procedure('swh_mktemp_content_ctags_missing') def mktemp_content_ctags_missing(self, cur=None): pass def content_ctags_missing_from_temp(self, cur=None): """List missing ctags. """ cur = self._cursor(cur) cur.execute("SELECT * FROM swh_content_ctags_missing()") yield from cursor_to_bytes(cur) def content_ctags_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", (conflict_update, )) def content_ctags_get_from_temp(self, cur=None): cur = self._cursor(cur) query = "SELECT %s FROM swh_content_ctags_get()" % ( ','.join(self.content_ctags_cols)) cur.execute(query) yield from cursor_to_bytes(cur) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) if not last_sha1: query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit)) else: if last_sha1 and isinstance(last_sha1, bytes): last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1) elif last_sha1: last_sha1 = '\\x%s' % last_sha1 query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit, last_sha1)) yield from cursor_to_bytes(cur) content_fossology_license_cols = [ 'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration', 'licenses'] @stored_procedure('swh_mktemp_content_fossology_license') def mktemp_content_fossology_license(self, cur=None): pass def content_fossology_license_add_from_temp(self, conflict_update, cur=None): """Add new licenses per content. """ self._cursor(cur).execute( "SELECT swh_content_fossology_license_add(%s)", (conflict_update, )) def content_fossology_license_get_from_temp(self, cur=None): """Retrieve licenses per content. """ cur = self._cursor(cur) query = "SELECT %s FROM swh_content_fossology_license_get()" % ( ','.join(self.content_fossology_license_cols)) cur.execute(query) yield from cursor_to_bytes(cur) content_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_metadata') def mktemp_content_metadata(self, cur=None): pass @stored_procedure('swh_mktemp_content_metadata_missing') def mktemp_content_metadata_missing(self, cur=None): pass def content_metadata_missing_from_temp(self, cur=None): """List missing metadatas. """ cur = self._cursor(cur) cur.execute("SELECT * FROM swh_content_metadata_missing()") yield from cursor_to_bytes(cur) def content_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", (conflict_update, )) def content_metadata_get_from_temp(self, cur=None): cur = self._cursor(cur) query = "SELECT %s FROM swh_content_metadata_get()" % ( ','.join(self.content_metadata_cols)) cur.execute(query) yield from cursor_to_bytes(cur) revision_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_revision_metadata') def mktemp_revision_metadata(self, cur=None): pass @stored_procedure('swh_mktemp_revision_metadata_missing') def mktemp_revision_metadata_missing(self, cur=None): pass def revision_metadata_missing_from_temp(self, cur=None): """List missing metadatas. """ cur = self._cursor(cur) cur.execute("SELECT * FROM swh_revision_metadata_missing()") yield from cursor_to_bytes(cur) def revision_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", (conflict_update, )) def revision_metadata_get_from_temp(self, cur=None): cur = self._cursor(cur) query = "SELECT %s FROM swh_revision_metadata_get()" % ( ','.join(self.revision_metadata_cols)) cur.execute(query) yield from cursor_to_bytes(cur) indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_indexer_configuration') def mktemp_indexer_configuration(self, cur=None): pass def indexer_configuration_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("SELECT %s from swh_indexer_configuration_add()" % ( ','.join(self.indexer_configuration_cols), )) yield from cursor_to_bytes(cur) def indexer_configuration_get(self, tool_name, tool_version, tool_configuration, cur=None): cur = self._cursor(cur) cur.execute('''select %s from indexer_configuration where tool_name=%%s and tool_version=%%s and tool_configuration=%%s''' % ( ','.join(self.indexer_configuration_cols)), (tool_name, tool_version, tool_configuration)) data = cur.fetchone() if not data: return None return line_to_bytes(data)