diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..650fb84 --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1 @@ +Siddharth Ravikumar diff --git a/PKG-INFO b/PKG-INFO index 365f08a..e985a5a 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.indexer -Version: 0.0.51 +Version: 0.0.52 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/sql/swh-func.sql b/sql/swh-func.sql index 82d0955..c4096f6 100644 --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -1,731 +1,381 @@ -- Postgresql index helper function create or replace function hash_sha1(text) returns text language sql strict immutable as $$ select encode(public.digest($1, 'sha1'), 'hex') $$; comment on function hash_sha1(text) is 'Compute sha1 hash as text'; --- create a temporary table with a single "bytea" column for fast object lookup. -create or replace function swh_mktemp_bytea() - returns void - language sql -as $$ - create temporary table tmp_bytea ( - id bytea - ) on commit drop; -$$; - -- create a temporary table called tmp_TBLNAME, mimicking existing table -- TBLNAME -- -- Args: -- tblname: name of the table to mimick create or replace function swh_mktemp(tblname regclass) returns void language plpgsql as $$ begin execute format(' create temporary table tmp_%1$I (like %1$I including defaults) on commit drop; alter table tmp_%1$I drop column if exists object_id; ', tblname); return; end $$; --- create a temporary table for content_ctags tmp_content_mimetype_missing, -create or replace function swh_mktemp_content_mimetype_missing() - returns void - language sql -as $$ - create temporary table tmp_content_mimetype_missing ( - id sha1, - indexer_configuration_id bigint - ) on commit drop; -$$; - -comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information'; - --- check which entries of tmp_bytea are missing from content_mimetype --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_mimetype_missing as tmp - where not exists - (select 1 from content_mimetype as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id)); - return; -end -$$; - -comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information'; - -- create a temporary table for content_mimetype tmp_content_mimetype, create or replace function swh_mktemp_content_mimetype() returns void language sql as $$ create temporary table tmp_content_mimetype ( like content_mimetype including defaults ) on commit drop; $$; comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information'; -- add tmp_content_mimetype entries to content_mimetype, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_mimetype_missing must take place before calling this -- function. -- -- -- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype, -- 2. call this function create or replace function swh_content_mimetype_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm on conflict(id, indexer_configuration_id) do update set mimetype = excluded.mimetype, encoding = excluded.encoding; else insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) select id, mimetype, encoding, indexer_configuration_id from tmp_content_mimetype tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; -create type content_mimetype_signature as( - id sha1, - mimetype bytea, - encoding bytea, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content mimetype from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_get() - returns setof content_mimetype_signature - language plpgsql -as $$ -begin - return query - select c.id, mimetype, encoding, - i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_mimetype c on c.id=t.id - inner join indexer_configuration i on c.indexer_configuration_id=i.id; - return; -end -$$; - -comment on function swh_content_mimetype_get() IS 'List content''s mimetypes'; - --- create a temporary table for content_language tmp_content_language, -create or replace function swh_mktemp_content_language_missing() - returns void - language sql -as $$ - create temporary table tmp_content_language_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language'; - --- check which entries of tmp_bytea are missing from content_language --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_language_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_content_language_missing as tmp - where not exists - (select 1 from content_language as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_content_language_missing() IS 'Filter missing content languages'; - -- add tmp_content_language entries to content_language, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_language_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_language, 2. call this function create or replace function swh_content_language_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_language (id, lang, indexer_configuration_id) select id, lang, indexer_configuration_id from tmp_content_language tcl on conflict(id, indexer_configuration_id) do update set lang = excluded.lang; else insert into content_language (id, lang, indexer_configuration_id) select id, lang, indexer_configuration_id from tmp_content_language tcl on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_language_add(boolean) IS 'Add new content languages'; -- create a temporary table for retrieving content_language create or replace function swh_mktemp_content_language() returns void language sql as $$ create temporary table tmp_content_language ( like content_language including defaults ) on commit drop; $$; comment on function swh_mktemp_content_language() is 'Helper table to add content language'; -create type content_language_signature as ( - id sha1, - lang languages, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content language from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_language_get() - returns setof content_language_signature - language plpgsql -as $$ -begin - return query - select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_language c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - -comment on function swh_content_language_get() is 'List content''s language'; - -- create a temporary table for content_ctags tmp_content_ctags, create or replace function swh_mktemp_content_ctags() returns void language sql as $$ create temporary table tmp_content_ctags ( like content_ctags including defaults ) on commit drop; $$; comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; -- add tmp_content_ctags entries to content_ctags, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, -- 2. call this function create or replace function swh_content_ctags_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then delete from content_ctags where id in (select tmp.id from tmp_content_ctags tmp inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); end if; insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) select id, name, kind, line, lang, indexer_configuration_id from tmp_content_ctags tct on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) do nothing; return; end $$; comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; --- create a temporary table for content_ctags missing routine -create or replace function swh_mktemp_content_ctags_missing() - returns void - language sql -as $$ - create temporary table tmp_content_ctags_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags'; - --- check which entries of tmp_bytea are missing from content_ctags --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_ctags_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_ctags_missing as tmp - where not exists - (select 1 from content_ctags as c - where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id - limit 1)); - return; -end -$$; - -comment on function swh_content_ctags_missing() IS 'Filter missing content ctags'; - create type content_ctags_signature as ( id sha1, name text, kind text, line bigint, lang ctags_languages, tool_id integer, tool_name text, tool_version text, tool_configuration jsonb ); --- Retrieve list of content ctags from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_ctags_get() - returns setof content_ctags_signature - language plpgsql -as $$ -begin - return query - select c.id, c.name, c.kind, c.line, c.lang, - i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration - from tmp_bytea t - inner join content_ctags c using(id) - inner join indexer_configuration i on i.id = c.indexer_configuration_id - order by line; - return; -end -$$; - -comment on function swh_content_ctags_get() IS 'List content ctags'; - -- Search within ctags content. -- create or replace function swh_content_ctags_search( expression text, l integer default 10, last_sha1 sha1 default '\x0000000000000000000000000000000000000000') returns setof content_ctags_signature language sql as $$ select c.id, name, kind, line, lang, i.id as tool_id, tool_name, tool_version, tool_configuration from content_ctags c inner join indexer_configuration i on i.id = c.indexer_configuration_id where hash_sha1(name) = hash_sha1(expression) and c.id > last_sha1 order by id limit l; $$; comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; -- create a temporary table for content_fossology_license tmp_content_fossology_license, create or replace function swh_mktemp_content_fossology_license() returns void language sql as $$ create temporary table tmp_content_fossology_license ( id sha1, license text, indexer_configuration_id integer ) on commit drop; $$; comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; -- add tmp_content_fossology_license entries to content_fossology_license, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to -- tmp_content_fossology_license, 2. call this function create or replace function swh_content_fossology_license_add(conflict_update boolean) returns void language plpgsql as $$ begin -- insert unknown licenses first insert into fossology_license (name) select distinct license from tmp_content_fossology_license tmp where not exists (select 1 from fossology_license where name=tmp.license) on conflict(name) do nothing; if conflict_update then -- delete from content_fossology_license c -- using tmp_content_fossology_license tmp, indexer_configuration i -- where c.id = tmp.id and i.id=tmp.indexer_configuration_id delete from content_fossology_license where id in (select tmp.id from tmp_content_fossology_license tmp inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); end if; insert into content_fossology_license (id, license_id, indexer_configuration_id) select tcl.id, (select id from fossology_license where name = tcl.license) as license, indexer_configuration_id from tmp_content_fossology_license tcl on conflict(id, license_id, indexer_configuration_id) do nothing; return; end $$; comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; -create type content_fossology_license_signature as ( - id sha1, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb, - licenses text[] -); - --- Retrieve list of content license from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_fossology_license_get() - returns setof content_fossology_license_signature - language plpgsql -as $$ -begin - return query - select cl.id, - ic.id as tool_id, - ic.tool_name, - ic.tool_version, - ic.tool_configuration, - array(select name - from fossology_license - where id = ANY(array_agg(cl.license_id))) as licenses - from tmp_bytea tcl - inner join content_fossology_license cl using(id) - inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id - group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration; - return; -end -$$; - -comment on function swh_content_fossology_license_get() IS 'List content licenses'; - -- content_metadata functions --- --- create a temporary table for content_metadata tmp_content_metadata, -create or replace function swh_mktemp_content_metadata_missing() - returns void - language sql -as $$ - create temporary table tmp_content_metadata_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; - --- check which entries of tmp_bytea are missing from content_metadata --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_metadata_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_content_metadata_missing as tmp - where not exists - (select 1 from content_metadata as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; -- add tmp_content_metadata entries to content_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_content_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_content_metadata, 2. call this function create or replace function swh_content_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into content_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do update set translated_metadata = excluded.translated_metadata; else insert into content_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; -- create a temporary table for retrieving content_metadata create or replace function swh_mktemp_content_metadata() returns void language sql as $$ create temporary table tmp_content_metadata ( like content_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; --- -create type content_metadata_signature as ( - id sha1, - translated_metadata jsonb, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content metadata from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_metadata_get() - returns setof content_metadata_signature - language plpgsql -as $$ -begin - return query - select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_metadata c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - -comment on function swh_content_metadata_get() is 'List content''s metadata'; -- end content_metadata functions --- revision_metadata functions --- --- create a temporary table for revision_metadata tmp_revision_metadata, -create or replace function swh_mktemp_revision_metadata_missing() - returns void - language sql -as $$ - create temporary table tmp_revision_metadata_missing ( - id sha1_git, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata'; - --- check which entries of tmp_bytea are missing from revision_metadata --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_revision_metadata_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_revision_metadata_missing as tmp - where not exists - (select 1 from revision_metadata as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata'; - -- add tmp_revision_metadata entries to revision_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- -- If filtering duplicates is in order, the call to -- swh_revision_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to -- tmp_revision_metadata, 2. call this function create or replace function swh_revision_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then insert into revision_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_revision_metadata tcm on conflict(id, indexer_configuration_id) do update set translated_metadata = excluded.translated_metadata; else insert into revision_metadata (id, translated_metadata, indexer_configuration_id) select id, translated_metadata, indexer_configuration_id from tmp_revision_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; return; end $$; comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata'; -- create a temporary table for retrieving revision_metadata create or replace function swh_mktemp_revision_metadata() returns void language sql as $$ create temporary table tmp_revision_metadata ( like revision_metadata including defaults ) on commit drop; $$; comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; --- -create type revision_metadata_signature as ( - id sha1_git, - translated_metadata jsonb, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of revision metadata from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_revision_metadata_get() - returns setof revision_metadata_signature - language plpgsql -as $$ -begin - return query - select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join revision_metadata c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - create or replace function swh_mktemp_indexer_configuration() returns void language sql as $$ create temporary table tmp_indexer_configuration ( like indexer_configuration including defaults ) on commit drop; alter table tmp_indexer_configuration drop column id; $$; -- add tmp_indexer_configuration entries to indexer_configuration, -- skipping duplicates if any. -- -- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to -- it, 2. call this function to insert and filtering out duplicates create or replace function swh_indexer_configuration_add() returns setof indexer_configuration language plpgsql as $$ begin insert into indexer_configuration(tool_name, tool_version, tool_configuration) select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp on conflict(tool_name, tool_version, tool_configuration) do nothing; return query select id, tool_name, tool_version, tool_configuration from tmp_indexer_configuration join indexer_configuration using(tool_name, tool_version, tool_configuration); return; end $$; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql index b950793..6151217 100644 --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -1,138 +1,138 @@ --- --- Software Heritage Indexers Data Model --- -- drop schema if exists swh cascade; -- create schema swh; -- set search_path to swh; create table dbversion ( version int primary key, release timestamptz, description text ); insert into dbversion(version, release, description) - values(114, now(), 'Work In Progress'); + values(115, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) create domain sha1 as bytea check (length(value) = 20); -- a Git object ID, i.e., a SHA1 checksum create domain sha1_git as bytea check (length(value) = 20); create table indexer_configuration ( id serial not null, tool_name text not null, tool_version text not null, tool_configuration jsonb ); comment on table indexer_configuration is 'Indexer''s configuration version'; comment on column indexer_configuration.id is 'Tool identifier'; comment on column indexer_configuration.tool_version is 'Tool name'; comment on column indexer_configuration.tool_version is 'Tool version'; comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; -- Properties (mimetype, encoding, etc...) create table content_mimetype ( id sha1 not null, mimetype bytea not null, encoding bytea not null, indexer_configuration_id bigint not null ); comment on table content_mimetype is 'Metadata associated to a raw content'; comment on column content_mimetype.mimetype is 'Raw content Mimetype'; comment on column content_mimetype.encoding is 'Raw content encoding'; comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; -- Language metadata create table content_language ( id sha1 not null, lang languages not null, indexer_configuration_id bigint not null ); comment on table content_language is 'Language information on a raw content'; comment on column content_language.lang is 'Language information'; comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; -- ctags information per content create table content_ctags ( id sha1 not null, name text not null, kind text not null, line bigint not null, lang ctags_languages not null, indexer_configuration_id bigint not null ); comment on table content_ctags is 'Ctags information on a raw content'; comment on column content_ctags.id is 'Content identifier'; comment on column content_ctags.name is 'Symbol name'; comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; comment on column content_ctags.line is 'Symbol line'; comment on column content_ctags.lang is 'Language information for that content'; comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; create table fossology_license( id smallserial, name text not null ); comment on table fossology_license is 'Possible license recognized by license indexer'; comment on column fossology_license.id is 'License identifier'; comment on column fossology_license.name is 'License name'; create table content_fossology_license ( id sha1 not null, license_id smallserial not null, indexer_configuration_id bigint not null ); comment on table content_fossology_license is 'license associated to a raw content'; comment on column content_fossology_license.id is 'Raw content identifier'; comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; -- The table content_metadata provides a translation to files -- identified as potentially containning metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, translated_metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; comment on column content_metadata.translated_metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; -- The table revision_metadata provides a minimal set of intrinsic metadata -- detected with the detection tool (indexer_configuration_id) and aggregated -- from the content_metadata translation. create table revision_metadata( id sha1_git not null, translated_metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; comment on column revision_metadata.id is 'sha1_git of revision'; comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; create table origin_metadata_translation( id bigserial not null, -- PK origin_metadata identifier result jsonb, tool_id bigint ); comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry'; comment on column origin_metadata_translation.id is 'the entry id in origin_metadata'; comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool'; comment on column origin_metadata_translation.tool_id is 'tool used for translation'; diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql new file mode 100644 index 0000000..49dd8fa --- /dev/null +++ b/sql/upgrades/115.sql @@ -0,0 +1,42 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 114 +-- to_version: 115 +-- description: Remove temporary table use in reading api + +insert into dbversion(version, release, description) +values(115, now(), 'Work In Progress'); + +drop function swh_mktemp_content_mimetype_missing(); +drop function swh_content_mimetype_missing(); + +drop function swh_content_mimetype_get(); +drop type content_mimetype_signature; + +drop function swh_mktemp_content_language_missing(); +drop function swh_content_language_missing(); + +drop function swh_content_language_get(); +drop type content_language_signature; + +drop function swh_mktemp_content_ctags_missing(); +drop function swh_content_ctags_missing(); + +drop function swh_content_ctags_get(); +--drop type content_ctags_signature; -- still used in swh_content_ctags_search + +drop function swh_content_fossology_license_get(); +drop type content_fossology_license_signature; + +drop function swh_mktemp_content_metadata_missing(); +drop function swh_content_metadata_missing(); + +drop function swh_content_metadata_get(); +drop type content_metadata_signature; + +drop function swh_mktemp_revision_metadata_missing(); +drop function swh_revision_metadata_missing(); + +drop function swh_revision_metadata_get(); +drop type revision_metadata_signature; + +drop function swh_mktemp_bytea(); diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 365f08a..e985a5a 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.indexer -Version: 0.0.51 +Version: 0.0.52 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index ac07ca8..a59988b 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,71 +1,73 @@ .gitignore AUTHORS +CONTRIBUTORS LICENSE MANIFEST.in Makefile README codemeta.json requirements-swh.txt requirements.txt setup.py version.txt debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder sql/Makefile sql/swh-data.sql sql/swh-enums.sql sql/swh-func.sql sql/swh-indexes.sql sql/swh-init.sql sql/swh-schema.sql sql/bin/db-upgrade sql/bin/dot_add_content sql/doc/json sql/json/.gitignore sql/json/Makefile sql/json/indexer_configuration.tool_configuration.schema.json sql/json/revision_metadata.translated_metadata.json +sql/upgrades/115.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/metadata_dictionary.py swh/indexer/mimetype.py swh/indexer/orchestrator.py swh/indexer/producer.py swh/indexer/rehash.py swh/indexer/tasks.py swh/indexer/storage/__init__.py swh/indexer/storage/converters.py swh/indexer/storage/db.py swh/indexer/storage/api/__init__.py swh/indexer/storage/api/client.py swh/indexer/storage/api/server.py swh/indexer/tests/__init__.py swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py swh/indexer/tests/test_utils.py swh/indexer/tests/storage/__init__.py swh/indexer/tests/storage/test_api_client.py swh/indexer/tests/storage/test_converters.py swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 20fd5e4..1bfaa5e 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,538 +1,541 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -import dateutil.parser import psycopg2 +from collections import defaultdict + from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError from .db import Db from . import converters INDEXER_CFG_KEY = 'indexer_storage' def get_indexer_storage(cls, args): """Get an indexer storage object of class `storage_class` with arguments `storage_args`. Args: args (dict): dictionary with keys: - cls (str): storage's class, either 'local' or 'remote' - args (dict): dictionary with keys Returns: an instance of swh.indexer's storage (either local or remote) Raises: ValueError if passed an unknown storage class. """ if cls == 'remote': from .api.client import RemoteStorage as IndexerStorage elif cls == 'local': from . import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) class IndexerStorage(): """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) def check_config(self, *, check_write): """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables with self.get_db().transaction() as cur: if check_write: check = 'INSERT' else: check = 'SELECT' cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,) ) return cur.fetchone()[0] return True @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): """List mimetypes missing from storage. Args: mimetypes (iterable): iterable of dict with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: an iterable of missing id for the triplets id, tool_name, - tool_version + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_mimetype_missing(cur) - db.copy_to(mimetypes, 'tmp_content_mimetype_missing', - ['id', 'indexer_configuration_id'], - cur) - for obj in db.content_mimetype_missing_from_temp(cur): + for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - mimetype (bytes): raw content's mimetype - - encoding (bytes): raw content's encoding - - indexer_configuration_id (int): tool's id used to - compute the results - - conflict_update: Flag to determine if we want to - overwrite (true) or skip duplicates (false, the default) + id (bytes): sha1 identifier + mimetype (bytes): raw content's mimetype + encoding (bytes): raw content's encoding + indexer_configuration_id (int): tool's id used to + compute the results + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates + (false, the default) """ db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_mimetype_get_from_temp(): + """Retrieve full content mimetype per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + mimetypes (iterable): dictionaries with keys: + + id (bytes): sha1 identifier + mimetype (bytes): raw content's mimetype + encoding (bytes): raw content's encoding + tool (dict): Tool used to compute the language + + """ + for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): """List languages missing from storage. Args: languages (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: identifiers of missing languages + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_language_missing(cur) - db.copy_to(languages, 'tmp_content_language_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.content_language_missing_from_temp(cur): + for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_language_get_from_temp(): + """Retrieve full content language per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + languages (iterable): dictionaries with keys: + + id (bytes): sha1 identifier + lang (bytes): raw content's language + tool (dict): Tool used to compute the language + + """ + for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): """Add languages not present in storage. Args: languages (iterable): dictionaries with keys: - - id: sha1 - - lang: bytes + id (bytes): sha1 + lang (bytes): language detected - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates (false, the + default) """ db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool name used - - tool_version (str): associated version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - an iterable of missing id + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_ctags_missing(cur) - db.copy_to(ctags, - tblname='tmp_content_ctags_missing', - columns=['id', 'indexer_configuration_id'], - cur=cur) - for obj in db.content_ctags_missing_from_temp(cur): + for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums + Yields: + Dictionaries with keys: + + id (bytes): content's identifier + name (str): symbol's name + kind (str): symbol's kind + language (str): language for that content + tool (dict): tool used to compute the ctags' info + + """ - db.store_tmp_bytea(ids, cur) - for c in db.content_ctags_get_from_temp(): + for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - - id (bytes): sha1 - - ctags ([list): List of dictionary with keys: name, kind, - line, language + id (bytes): sha1 + ctags ([list): List of dictionary with keys: name, kind, + line, language """ def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... """ for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): """Retrieve licenses per id. Args: ids (iterable): sha1 checksums Yields: list: dictionaries with the following keys: - - id (bytes) - - licenses ([str]): associated licenses for that content + id (bytes) + licenses ([str]): associated licenses for that content + tool (dict): Tool used to compute the license """ - db.store_tmp_bytea(ids, cur) - - for c in db.content_fossology_license_get_from_temp(): + d = defaultdict(list) + for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) - yield converters.db_to_fossology_license(license) + + id_ = license['id'] + d[id_].append(converters.db_to_fossology_license(license)) + + for id_, facts in d.items(): + yield {id_: facts} @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): """Add licenses not present in storage. Args: licenses (iterable): dictionaries with keys: - id: sha1 - license ([bytes]): List of licenses associated to sha1 - tool (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) Returns: list: content_license entries which failed due to unknown licenses """ # Then, we add the correct ones db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) @db_transaction_generator() - def content_metadata_missing(self, metadatas, db=None, cur=None): - """List metadatas missing from storage. + def content_metadata_missing(self, metadata, db=None, cur=None): + """List metadata missing from storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: missing ids + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_metadata_missing(cur) - db.copy_to(metadatas, 'tmp_content_metadata_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.content_metadata_missing_from_temp(cur): + for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_metadata_get_from_temp(): + """Retrieve metadata per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + id (bytes) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @db_transaction() - def content_metadata_add(self, metadatas, conflict_update=False, db=None, + def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadatas not present in storage. + """Add metadata not present in storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - - id: sha1 - - translated_metadata: bytes / jsonb ? + id: sha1 + translated_metadata: bytes / jsonb ? conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_content_metadata(cur) # empty metadata is mapped to 'unknown' - db.copy_to(metadatas, 'tmp_content_metadata', + db.copy_to(metadata, 'tmp_content_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @db_transaction_generator() - def revision_metadata_missing(self, metadatas, db=None, cur=None): - """List metadatas missing from storage. + def revision_metadata_missing(self, metadata, db=None, cur=None): + """List metadata missing from storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - - id (bytes): sha1_git revision identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1_git revision identifier + indexer_configuration_id (int): tool used to compute + the results Returns: iterable: missing ids """ - db.mktemp_revision_metadata_missing(cur) - db.copy_to(metadatas, 'tmp_revision_metadata_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.revision_metadata_missing_from_temp(cur): + for obj in db.revision_metadata_missing_from_list(metadata, cur): yield obj[0] @db_transaction_generator() def revision_metadata_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.revision_metadata_get_from_temp(): + """Retrieve revision metadata per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + id (bytes) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.revision_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_metadata_cols, c))) @db_transaction() - def revision_metadata_add(self, metadatas, conflict_update=False, db=None, + def revision_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadatas not present in storage. + """Add metadata not present in storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - id: sha1_git of revision - translated_metadata: bytes / jsonb ? conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ db.mktemp_revision_metadata(cur) # empty metadata is mapped to 'unknown' - db.copy_to(metadatas, 'tmp_revision_metadata', + db.copy_to(metadata, 'tmp_revision_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) - @db_transaction() - def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, - db=None, cur=None): - """ Add an origin_metadata for the origin at ts with provenance and - metadata. - - Args: - origin_id (int): the origin's id for which the metadata is added - ts (datetime): timestamp of the found metadata - provider (int): the provider of metadata (ex:'hal') - tool (int): tool used to extract metadata - metadata (jsonb): the metadata retrieved at the time and location - - Returns: - id (int): the origin_metadata unique id - """ - if isinstance(ts, str): - ts = dateutil.parser.parse(ts) - - return db.origin_metadata_add(origin_id, ts, provider, tool, - metadata, cur) - - @db_transaction_generator() - def origin_metadata_get_by(self, origin_id, provider_type=None, db=None, - cur=None): - """Retrieve list of all origin_metadata entries for the origin_id - - Args: - origin_id (int): the unique origin identifier - provider_type (str): (optional) type of provider - - Returns: - list of dicts: the origin_metadata dictionary with the keys: - - - id (int): origin_metadata's id - - origin_id (int): origin's id - - discovery_date (datetime): timestamp of discovery - - tool_id (int): metadata's extracting tool - - metadata (jsonb) - - provider_id (int): metadata's provider - - provider_name (str) - - provider_type (str) - - provider_url (str) - - """ - for line in db.origin_metadata_get_by(origin_id, provider_type, cur): - yield dict(zip(db.origin_metadata_get_cols, line)) - @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys:: tool_name (str): tool's name tool_version (str): tool's version tool_configuration (dict): tool's configuration (free form dict) Returns: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. """ db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys:: tool_name (str): tool's name tool_version (str): tool's version tool_configuration (dict): tool's configuration (free form dict) Returns: The identifier of the tool if it exists, None otherwise. """ tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py index 25268bb..004d323 100644 --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -1,101 +1,101 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import SWHRemoteAPI from swh.storage.exc import StorageAPIError class RemoteStorage(SWHRemoteAPI): """Proxy to a remote storage API""" def __init__(self, url, timeout=None): super().__init__( api_exception=StorageAPIError, url=url, timeout=timeout) def check_config(self, *, check_write): return self.post('check_config', {'check_write': check_write}) def content_mimetype_add(self, mimetypes, conflict_update=False): return self.post('content_mimetype/add', { 'mimetypes': mimetypes, 'conflict_update': conflict_update, }) def content_mimetype_missing(self, mimetypes): return self.post('content_mimetype/missing', {'mimetypes': mimetypes}) def content_mimetype_get(self, ids): return self.post('content_mimetype', {'ids': ids}) def content_language_add(self, languages, conflict_update=False): return self.post('content_language/add', { 'languages': languages, 'conflict_update': conflict_update, }) def content_language_missing(self, languages): return self.post('content_language/missing', {'languages': languages}) def content_language_get(self, ids): return self.post('content_language', {'ids': ids}) def content_ctags_add(self, ctags, conflict_update=False): return self.post('content/ctags/add', { 'ctags': ctags, 'conflict_update': conflict_update, }) def content_ctags_missing(self, ctags): return self.post('content/ctags/missing', {'ctags': ctags}) def content_ctags_get(self, ids): return self.post('content/ctags', {'ids': ids}) def content_ctags_search(self, expression, limit=10, last_sha1=None): return self.post('content/ctags/search', { 'expression': expression, 'limit': limit, 'last_sha1': last_sha1, }) def content_fossology_license_add(self, licenses, conflict_update=False): return self.post('content/fossology_license/add', { 'licenses': licenses, 'conflict_update': conflict_update, }) def content_fossology_license_get(self, ids): return self.post('content/fossology_license', {'ids': ids}) - def content_metadata_add(self, metadatas, conflict_update=False): + def content_metadata_add(self, metadata, conflict_update=False): return self.post('content_metadata/add', { - 'metadatas': metadatas, + 'metadata': metadata, 'conflict_update': conflict_update, }) - def content_metadata_missing(self, metadatas): - return self.post('content_metadata/missing', {'metadatas': metadatas}) + def content_metadata_missing(self, metadata): + return self.post('content_metadata/missing', {'metadata': metadata}) def content_metadata_get(self, ids): return self.post('content_metadata', {'ids': ids}) - def revision_metadata_add(self, metadatas, conflict_update=False): + def revision_metadata_add(self, metadata, conflict_update=False): return self.post('revision_metadata/add', { - 'metadatas': metadatas, + 'metadata': metadata, 'conflict_update': conflict_update, }) - def revision_metadata_missing(self, metadatas): - return self.post('revision_metadata/missing', {'metadatas': metadatas}) + def revision_metadata_missing(self, metadata): + return self.post('revision_metadata/missing', {'metadata': metadata}) def revision_metadata_get(self, ids): return self.post('revision_metadata', {'ids': ids}) def indexer_configuration_add(self, tools): return self.post('indexer_configuration/add', {'tools': tools}) def indexer_configuration_get(self, tool): return self.post('indexer_configuration/data', {'tool': tool}) diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py index db7a295..3cf5da1 100644 --- a/swh/indexer/storage/converters.py +++ b/swh/indexer/storage/converters.py @@ -1,140 +1,139 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def ctags_to_db(ctags): """Convert a ctags entry into a ready ctags entry. Args: ctags (dict): ctags entry with the following keys: - id (bytes): content's identifier - tool_id (int): tool id used to compute ctags - ctags ([dict]): List of dictionary with the following keys: - name (str): symbol's name - kind (str): symbol's kind - line (int): symbol's line in the content - language (str): language Returns: list: list of ctags entries as dicts with the following keys: - id (bytes): content's identifier - name (str): symbol's name - kind (str): symbol's kind - language (str): language for that content - tool_id (int): tool id used to compute ctags """ id = ctags['id'] tool_id = ctags['indexer_configuration_id'] for ctag in ctags['ctags']: yield { 'id': id, 'name': ctag['name'], 'kind': ctag['kind'], 'line': ctag['line'], 'lang': ctag['lang'], 'indexer_configuration_id': tool_id, } def db_to_ctags(ctag): """Convert a ctags entry into a ready ctags entry. Args: ctags (dict): ctags entry with the following keys: - id (bytes): content's identifier - ctags ([dict]): List of dictionary with the following keys: - name (str): symbol's name - kind (str): symbol's kind - line (int): symbol's line in the content - language (str): language Returns: List of ctags ready entry (dict with the following keys): - id (bytes): content's identifier - name (str): symbol's name - kind (str): symbol's kind - language (str): language for that content - tool (dict): tool used to compute the ctags """ return { 'id': ctag['id'], 'name': ctag['name'], 'kind': ctag['kind'], 'line': ctag['line'], 'lang': ctag['lang'], 'tool': { 'id': ctag['tool_id'], 'name': ctag['tool_name'], 'version': ctag['tool_version'], 'configuration': ctag['tool_configuration'] } } def db_to_mimetype(mimetype): """Convert a ctags entry into a ready ctags output. """ return { 'id': mimetype['id'], 'encoding': mimetype['encoding'], 'mimetype': mimetype['mimetype'], 'tool': { 'id': mimetype['tool_id'], 'name': mimetype['tool_name'], 'version': mimetype['tool_version'], 'configuration': mimetype['tool_configuration'] } } def db_to_language(language): """Convert a language entry into a ready language output. """ return { 'id': language['id'], 'lang': language['lang'], 'tool': { 'id': language['tool_id'], 'name': language['tool_name'], 'version': language['tool_version'], 'configuration': language['tool_configuration'] } } def db_to_metadata(metadata): """Convert a metadata entry into a ready metadata output. """ return { 'id': metadata['id'], 'translated_metadata': metadata['translated_metadata'], 'tool': { 'id': metadata['tool_id'], 'name': metadata['tool_name'], 'version': metadata['tool_version'], 'configuration': metadata['tool_configuration'] } } def db_to_fossology_license(license): return { - 'id': license['id'], 'licenses': license['licenses'], 'tool': { 'id': license['tool_id'], 'name': license['tool_name'], 'version': license['tool_version'], 'configuration': license['tool_configuration'], } } diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py index b51402e..3c17d78 100644 --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,245 +1,305 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model import hashutil from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes -from swh.storage.db import line_to_bytes +from swh.storage.db import line_to_bytes, execute_values_to_bytes class Db(BaseDb): """Proxy to the SWH Indexer DB, with wrappers around stored procedures """ - @stored_procedure('swh_mktemp_bytea') - def mktemp_bytea(self, cur=None): pass + content_mimetype_hash_keys = ['id', 'indexer_configuration_id'] - def store_tmp_bytea(self, ids, cur=None): - """Store the given identifiers in a new tmp_bytea table""" - cur = self._cursor(cur) - - self.mktemp_bytea(cur) - self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', - ['id'], cur) + def _missing_from_list(self, table, data, hash_keys, cur=None): + """Read from table the data with hash_keys that are missing. - content_mimetype_cols = [ - 'id', 'mimetype', 'encoding', - 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + Args: + table (str): Table name (e.g content_mimetype, content_language, + etc...) + data (dict): Dict of data to read from + hash_keys ([str]): List of keys to read in the data dict. - @stored_procedure('swh_mktemp_content_mimetype_missing') - def mktemp_content_mimetype_missing(self, cur=None): pass + Yields: + The data which is missing from the db. - def content_mimetype_missing_from_temp(self, cur=None): + """ + cur = self._cursor(cur) + keys = ', '.join(hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) for key in hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from %s c + where %s + ) + """ % (keys, keys, table, equality), + (tuple(m[k] for k in hash_keys) for m in data) + ) + + def content_mimetype_missing_from_list(self, mimetypes, cur=None): """List missing mimetypes. """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_mimetype_missing()") - yield from cursor_to_bytes(cur) + yield from self._missing_from_list( + 'content_mimetype', mimetypes, self.content_mimetype_hash_keys, + cur=cur) + + content_mimetype_cols = [ + 'id', 'mimetype', 'encoding', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_mimetype') def mktemp_content_mimetype(self, cur=None): pass def content_mimetype_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", (conflict_update, )) - def content_mimetype_get_from_temp(self, cur=None): + def _convert_key(self, key, main_table='c'): + """Convert keys according to specific use in the module. + Args: + key (str): Key expression to change according to the alias + used in the query + main_table (str): Alias to use for the main table. Default + to c for content_{something}. + + Expected: + Tables content_{something} being aliased as 'c' (something + in {language, mimetype, ...}), table indexer_configuration + being aliased as 'i'. + + """ + if key == 'id': + return '%s.id' % main_table + elif key == 'tool_id': + return 'i.id as tool_id' + elif key == 'licenses': + return ''' + array(select name + from fossology_license + where id = ANY( + array_agg(%s.license_id))) as licenses''' % main_table + return key + + def _get_from_list(self, table, ids, cols, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_mimetype_get()" % ( - ','.join(self.content_mimetype_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join %s c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id; + """ % (', '.join(keys), table), + ((_id,) for _id in ids) + ) + + def content_mimetype_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_mimetype', ids, self.content_mimetype_cols, cur=cur) + + content_language_hash_keys = ['id', 'indexer_configuration_id'] + + def content_language_missing_from_list(self, languages, cur=None): + """List missing languages. + + """ + yield from self._missing_from_list( + 'content_language', languages, self.content_language_hash_keys, + cur=cur) content_language_cols = [ 'id', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_language') def mktemp_content_language(self, cur=None): pass - @stored_procedure('swh_mktemp_content_language_missing') - def mktemp_content_language_missing(self, cur=None): pass - - def content_language_missing_from_temp(self, cur=None): - """List missing languages. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_language_missing()") - yield from cursor_to_bytes(cur) - def content_language_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_language_add(%s)", (conflict_update, )) - def content_language_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_language_get()" % ( - ','.join(self.content_language_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def content_language_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_language', ids, self.content_language_cols, cur=cur) + + content_ctags_hash_keys = ['id', 'indexer_configuration_id'] + + def content_ctags_missing_from_list(self, ctags, cur=None): + """List missing ctags. + + """ + yield from self._missing_from_list( + 'content_ctags', ctags, self.content_ctags_hash_keys, + cur=cur) content_ctags_cols = [ 'id', 'name', 'kind', 'line', 'lang', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_ctags') def mktemp_content_ctags(self, cur=None): pass - @stored_procedure('swh_mktemp_content_ctags_missing') - def mktemp_content_ctags_missing(self, cur=None): pass - - def content_ctags_missing_from_temp(self, cur=None): - """List missing ctags. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_ctags_missing()") - yield from cursor_to_bytes(cur) - def content_ctags_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", (conflict_update, )) - def content_ctags_get_from_temp(self, cur=None): + def content_ctags_get_from_list(self, ids, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_ctags_get()" % ( - ','.join(self.content_ctags_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_ctags_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_ctags c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id + order by line + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) if not last_sha1: query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit)) else: if last_sha1 and isinstance(last_sha1, bytes): last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1) elif last_sha1: last_sha1 = '\\x%s' % last_sha1 query = """SELECT %s FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( ','.join(self.content_ctags_cols)) cur.execute(query, (expression, limit, last_sha1)) yield from cursor_to_bytes(cur) content_fossology_license_cols = [ 'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration', 'licenses'] @stored_procedure('swh_mktemp_content_fossology_license') def mktemp_content_fossology_license(self, cur=None): pass def content_fossology_license_add_from_temp(self, conflict_update, cur=None): """Add new licenses per content. """ self._cursor(cur).execute( "SELECT swh_content_fossology_license_add(%s)", (conflict_update, )) - def content_fossology_license_get_from_temp(self, cur=None): - """Retrieve licenses per content. + def content_fossology_license_get_from_list(self, ids, cur=None): + """Retrieve licenses per id. """ cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_fossology_license_get()" % ( - ','.join(self.content_fossology_license_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_fossology_license_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_fossology_license c on t.id=c.id + inner join indexer_configuration i + on i.id=c.indexer_configuration_id + group by c.id, i.id, i.tool_name, i.tool_version, + i.tool_configuration; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) + + content_metadata_hash_keys = ['id', 'indexer_configuration_id'] + + def content_metadata_missing_from_list(self, metadata, cur=None): + """List missing metadata. + + """ + yield from self._missing_from_list( + 'content_metadata', metadata, self.content_metadata_hash_keys, + cur=cur) content_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_metadata') def mktemp_content_metadata(self, cur=None): pass - @stored_procedure('swh_mktemp_content_metadata_missing') - def mktemp_content_metadata_missing(self, cur=None): pass - - def content_metadata_missing_from_temp(self, cur=None): - """List missing metadatas. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_metadata_missing()") - yield from cursor_to_bytes(cur) - def content_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", (conflict_update, )) - def content_metadata_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_metadata_get()" % ( - ','.join(self.content_metadata_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def content_metadata_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_metadata', ids, self.content_metadata_cols, cur=cur) + + revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] + + def revision_metadata_missing_from_list(self, metadata, cur=None): + """List missing metadata. + + """ + yield from self._missing_from_list( + 'revision_metadata', metadata, self.revision_metadata_hash_keys, + cur=cur) revision_metadata_cols = [ 'id', 'translated_metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_revision_metadata') def mktemp_revision_metadata(self, cur=None): pass - @stored_procedure('swh_mktemp_revision_metadata_missing') - def mktemp_revision_metadata_missing(self, cur=None): pass - - def revision_metadata_missing_from_temp(self, cur=None): - """List missing metadatas. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_revision_metadata_missing()") - yield from cursor_to_bytes(cur) - def revision_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", (conflict_update, )) - def revision_metadata_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_revision_metadata_get()" % ( - ','.join(self.revision_metadata_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def revision_metadata_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_indexer_configuration') def mktemp_indexer_configuration(self, cur=None): pass def indexer_configuration_add_from_temp(self, cur=None): cur = self._cursor(cur) cur.execute("SELECT %s from swh_indexer_configuration_add()" % ( ','.join(self.indexer_configuration_cols), )) yield from cursor_to_bytes(cur) def indexer_configuration_get(self, tool_name, tool_version, tool_configuration, cur=None): cur = self._cursor(cur) cur.execute('''select %s from indexer_configuration where tool_name=%%s and tool_version=%%s and tool_configuration=%%s''' % ( ','.join(self.indexer_configuration_cols)), (tool_name, tool_version, tool_configuration)) data = cur.fetchone() if not data: return None return line_to_bytes(data) diff --git a/swh/indexer/tests/storage/test_converters.py b/swh/indexer/tests/storage/test_converters.py index 89946d4..a5d6bfa 100644 --- a/swh/indexer/tests/storage/test_converters.py +++ b/swh/indexer/tests/storage/test_converters.py @@ -1,199 +1,198 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from nose.plugins.attrib import attr from swh.indexer.storage import converters @attr('!db') class TestConverters(unittest.TestCase): def setUp(self): self.maxDiff = None @istest def ctags_to_db(self): input_ctag = { 'id': b'some-id', 'indexer_configuration_id': 100, 'ctags': [ { 'name': 'some-name', 'kind': 'some-kind', 'line': 10, 'lang': 'Yaml', }, { 'name': 'main', 'kind': 'function', 'line': 12, 'lang': 'Yaml', }, ] } expected_ctags = [ { 'id': b'some-id', 'name': 'some-name', 'kind': 'some-kind', 'line': 10, 'lang': 'Yaml', 'indexer_configuration_id': 100, }, { 'id': b'some-id', 'name': 'main', 'kind': 'function', 'line': 12, 'lang': 'Yaml', 'indexer_configuration_id': 100, }] # when actual_ctags = list(converters.ctags_to_db(input_ctag)) # then self.assertEquals(actual_ctags, expected_ctags) @istest def db_to_ctags(self): input_ctags = { 'id': b'some-id', 'name': 'some-name', 'kind': 'some-kind', 'line': 10, 'lang': 'Yaml', 'tool_id': 200, 'tool_name': 'some-toolname', 'tool_version': 'some-toolversion', 'tool_configuration': {} } expected_ctags = { 'id': b'some-id', 'name': 'some-name', 'kind': 'some-kind', 'line': 10, 'lang': 'Yaml', 'tool': { 'id': 200, 'name': 'some-toolname', 'version': 'some-toolversion', 'configuration': {}, } } # when actual_ctags = converters.db_to_ctags(input_ctags) # then self.assertEquals(actual_ctags, expected_ctags) @istest def db_to_mimetype(self): input_mimetype = { 'id': b'some-id', 'tool_id': 10, 'tool_name': 'some-toolname', 'tool_version': 'some-toolversion', 'tool_configuration': {}, 'encoding': b'ascii', 'mimetype': b'text/plain', } expected_mimetype = { 'id': b'some-id', 'encoding': b'ascii', 'mimetype': b'text/plain', 'tool': { 'id': 10, 'name': 'some-toolname', 'version': 'some-toolversion', 'configuration': {}, } } actual_mimetype = converters.db_to_mimetype(input_mimetype) self.assertEquals(actual_mimetype, expected_mimetype) @istest def db_to_language(self): input_language = { 'id': b'some-id', 'tool_id': 20, 'tool_name': 'some-toolname', 'tool_version': 'some-toolversion', 'tool_configuration': {}, 'lang': b'css', } expected_language = { 'id': b'some-id', 'lang': b'css', 'tool': { 'id': 20, 'name': 'some-toolname', 'version': 'some-toolversion', 'configuration': {}, } } actual_language = converters.db_to_language(input_language) self.assertEquals(actual_language, expected_language) @istest def db_to_fossology_license(self): input_license = { 'id': b'some-id', 'tool_id': 20, 'tool_name': 'nomossa', 'tool_version': '5.22', 'tool_configuration': {}, 'licenses': ['GPL2.0'], } expected_license = { - 'id': b'some-id', 'licenses': ['GPL2.0'], 'tool': { 'id': 20, 'name': 'nomossa', 'version': '5.22', 'configuration': {}, } } actual_license = converters.db_to_fossology_license(input_license) self.assertEquals(actual_license, expected_license) @istest def db_to_metadata(self): input_metadata = { 'id': b'some-id', 'tool_id': 20, 'tool_name': 'some-toolname', 'tool_version': 'some-toolversion', 'tool_configuration': {}, 'translated_metadata': b'translated_metadata', } expected_metadata = { 'id': b'some-id', 'translated_metadata': b'translated_metadata', 'tool': { 'id': 20, 'name': 'some-toolname', 'version': 'some-toolversion', 'configuration': {}, } } actual_metadata = converters.db_to_metadata(input_metadata) self.assertEquals(actual_metadata, expected_metadata) diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index 040cd36..7b97b61 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1478 +1,1487 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pathlib import unittest from nose.tools import istest from nose.plugins.attrib import attr from swh.model.hashutil import hash_to_bytes from swh.indexer.storage import get_indexer_storage from swh.core.tests.db_testing import DbTestFixture PATH_TO_STORAGE_TEST_DATA = '../../../../../swh-storage-testdata' class StorageTestFixture: """Mix this in a test subject class to get Storage testing support. This fixture requires to come before DbTestFixture in the inheritance list as it uses its methods to setup its own internal database. Usage example: class TestStorage(StorageTestFixture, DbTestFixture): ... """ TEST_STORAGE_DB_NAME = 'softwareheritage-test-indexer' @classmethod def setUpClass(cls): if not hasattr(cls, 'DB_TEST_FIXTURE_IMPORTED'): raise RuntimeError("StorageTestFixture needs to be followed by " "DbTestFixture in the inheritance list.") test_dir = pathlib.Path(__file__).absolute().parent test_data_dir = test_dir / PATH_TO_STORAGE_TEST_DATA test_db_dump = (test_data_dir / 'dumps/swh-indexer.dump').absolute() cls.add_db(cls.TEST_STORAGE_DB_NAME, str(test_db_dump), 'pg_dump') super().setUpClass() def setUp(self): super().setUp() self.storage_config = { 'cls': 'local', 'args': { 'db': 'dbname=%s' % self.TEST_STORAGE_DB_NAME, }, } self.storage = get_indexer_storage(**self.storage_config) def tearDown(self): self.storage = None super().tearDown() def reset_storage_tables(self): excluded = {'indexer_configuration'} self.reset_db_tables(self.TEST_STORAGE_DB_NAME, excluded=excluded) db = self.test_db[self.TEST_STORAGE_DB_NAME] db.conn.commit() @attr('db') class BaseTestStorage(StorageTestFixture, DbTestFixture): def setUp(self): super().setUp() self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') self.revision_id_1 = hash_to_bytes( '7026b7c1a2af56521e951c01ed20f255fa054238') self.revision_id_2 = hash_to_bytes( '7026b7c1a2af56521e9587659012345678904321') cur = self.test_db[self.TEST_STORAGE_DB_NAME].cursor tools = {} cur.execute(''' select tool_name, id, tool_version, tool_configuration from indexer_configuration order by id''') for row in cur.fetchall(): key = row[0] while key in tools: key = '_' + key tools[key] = { 'id': row[1], 'name': row[0], 'version': row[2], 'configuration': row[3] } self.tools = tools def tearDown(self): self.reset_storage_tables() super().tearDown() @attr('db') class CommonTestStorage(BaseTestStorage): """Base class for Indexer Storage testing. """ @istest def check_config(self): self.assertTrue(self.storage.check_config(check_write=True)) self.assertTrue(self.storage.check_config(check_write=False)) @istest def content_mimetype_missing(self): # given tool_id = self.tools['file']['id'] mimetypes = [ { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }] # when actual_missing = self.storage.content_mimetype_missing(mimetypes) # then self.assertEqual(list(actual_missing), [ self.sha1_1, self.sha1_2, ]) # given self.storage.content_mimetype_add([{ 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'indexer_configuration_id': tool_id, }]) # when actual_missing = self.storage.content_mimetype_missing(mimetypes) # then self.assertEqual(list(actual_missing), [self.sha1_1]) @istest def content_mimetype_add__drop_duplicate(self): # given tool_id = self.tools['file']['id'] mimetype_v1 = { 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'indexer_configuration_id': tool_id, } # given self.storage.content_mimetype_add([mimetype_v1]) # when actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) # then expected_mimetypes_v1 = [{ 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'tool': self.tools['file'], }] self.assertEqual(actual_mimetypes, expected_mimetypes_v1) # given mimetype_v2 = mimetype_v1.copy() mimetype_v2.update({ 'mimetype': b'text/html', 'encoding': b'us-ascii', }) self.storage.content_mimetype_add([mimetype_v2]) actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) # mimetype did not change as the v2 was dropped. self.assertEqual(actual_mimetypes, expected_mimetypes_v1) @istest def content_mimetype_add__update_in_place_duplicate(self): # given tool_id = self.tools['file']['id'] mimetype_v1 = { 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'indexer_configuration_id': tool_id, } # given self.storage.content_mimetype_add([mimetype_v1]) # when actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) expected_mimetypes_v1 = [{ 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'tool': self.tools['file'], }] # then self.assertEqual(actual_mimetypes, expected_mimetypes_v1) # given mimetype_v2 = mimetype_v1.copy() mimetype_v2.update({ 'mimetype': b'text/html', 'encoding': b'us-ascii', }) self.storage.content_mimetype_add([mimetype_v2], conflict_update=True) actual_mimetypes = list(self.storage.content_mimetype_get( [self.sha1_2])) expected_mimetypes_v2 = [{ 'id': self.sha1_2, 'mimetype': b'text/html', 'encoding': b'us-ascii', 'tool': { 'id': 2, 'name': 'file', 'version': '5.22', 'configuration': {'command_line': 'file --mime '} } }] # mimetype did change as the v2 was used to overwrite v1 self.assertEqual(actual_mimetypes, expected_mimetypes_v2) @istest def content_mimetype_get(self): # given tool_id = self.tools['file']['id'] mimetypes = [self.sha1_2, self.sha1_1] mimetype1 = { 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'indexer_configuration_id': tool_id, } # when self.storage.content_mimetype_add([mimetype1]) # then actual_mimetypes = list(self.storage.content_mimetype_get(mimetypes)) # then expected_mimetypes = [{ 'id': self.sha1_2, 'mimetype': b'text/plain', 'encoding': b'utf-8', 'tool': self.tools['file'] }] self.assertEqual(actual_mimetypes, expected_mimetypes) @istest def content_language_missing(self): # given tool_id = self.tools['pygments']['id'] languages = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, } ] # when actual_missing = list(self.storage.content_language_missing(languages)) # then self.assertEqual(list(actual_missing), [ self.sha1_2, self.sha1_1, ]) # given self.storage.content_language_add([{ 'id': self.sha1_2, 'lang': 'haskell', 'indexer_configuration_id': tool_id, }]) # when actual_missing = list(self.storage.content_language_missing(languages)) # then self.assertEqual(actual_missing, [self.sha1_1]) @istest def content_language_get(self): # given tool_id = self.tools['pygments']['id'] language1 = { 'id': self.sha1_2, 'lang': 'common-lisp', 'indexer_configuration_id': tool_id, } # when self.storage.content_language_add([language1]) # then actual_languages = list(self.storage.content_language_get( [self.sha1_2, self.sha1_1])) # then expected_languages = [{ 'id': self.sha1_2, 'lang': 'common-lisp', 'tool': self.tools['pygments'] }] self.assertEqual(actual_languages, expected_languages) @istest def content_language_add__drop_duplicate(self): # given tool_id = self.tools['pygments']['id'] language_v1 = { 'id': self.sha1_2, 'lang': 'emacslisp', 'indexer_configuration_id': tool_id, } # given self.storage.content_language_add([language_v1]) # when actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # then expected_languages_v1 = [{ 'id': self.sha1_2, 'lang': 'emacslisp', 'tool': self.tools['pygments'] }] self.assertEqual(actual_languages, expected_languages_v1) # given language_v2 = language_v1.copy() language_v2.update({ 'lang': 'common-lisp', }) self.storage.content_language_add([language_v2]) actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # language did not change as the v2 was dropped. self.assertEqual(actual_languages, expected_languages_v1) @istest def content_language_add__update_in_place_duplicate(self): # given tool_id = self.tools['pygments']['id'] language_v1 = { 'id': self.sha1_2, 'lang': 'common-lisp', 'indexer_configuration_id': tool_id, } # given self.storage.content_language_add([language_v1]) # when actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # then expected_languages_v1 = [{ 'id': self.sha1_2, 'lang': 'common-lisp', 'tool': self.tools['pygments'] }] self.assertEqual(actual_languages, expected_languages_v1) # given language_v2 = language_v1.copy() language_v2.update({ 'lang': 'emacslisp', }) self.storage.content_language_add([language_v2], conflict_update=True) actual_languages = list(self.storage.content_language_get( [self.sha1_2])) # language did not change as the v2 was dropped. expected_languages_v2 = [{ 'id': self.sha1_2, 'lang': 'emacslisp', 'tool': self.tools['pygments'] }] # language did change as the v2 was used to overwrite v1 self.assertEqual(actual_languages, expected_languages_v2) @istest def content_ctags_missing(self): # given tool_id = self.tools['universal-ctags']['id'] ctags = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, } ] # when actual_missing = self.storage.content_ctags_missing(ctags) # then self.assertEqual(list(actual_missing), [ self.sha1_2, self.sha1_1 ]) # given self.storage.content_ctags_add([ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, ]) # when actual_missing = self.storage.content_ctags_missing(ctags) # then self.assertEqual(list(actual_missing), [self.sha1_1]) @istest def content_ctags_get(self): # given tool_id = self.tools['universal-ctags']['id'] ctags = [self.sha1_2, self.sha1_1] ctag1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] } # when self.storage.content_ctags_add([ctag1]) # then actual_ctags = list(self.storage.content_ctags_get(ctags)) # then expected_ctags = [ { 'id': self.sha1_2, 'tool': self.tools['universal-ctags'], 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'id': self.sha1_2, 'tool': self.tools['universal-ctags'], 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', } ] self.assertEqual(actual_ctags, expected_ctags) @istest def content_ctags_search(self): # 1. given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, ] } ctag2 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ] } self.storage.content_ctags_add([ctag1, ctag2]) # 1. when actual_ctags = list(self.storage.content_ctags_search('hello', limit=1)) # 1. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ]) # 2. when actual_ctags = list(self.storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then self.assertEqual(actual_ctags, [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ]) # 3. when actual_ctags = list(self.storage.content_ctags_search('hello')) # 3. then self.assertEqual(actual_ctags, [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ]) # 4. when actual_ctags = list(self.storage.content_ctags_search('counter')) # then self.assertEqual(actual_ctags, [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }]) @istest def content_ctags_search_no_result(self): actual_ctags = list(self.storage.content_ctags_search('counter')) self.assertEquals(actual_ctags, []) @istest def content_ctags_add__add_new_ctags_added(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) self.storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [{ 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) self.assertEqual(actual_ctags, expected_ctags) @istest def content_ctags_add__update_in_place(self): # given tool = self.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given self.storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # then expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] self.assertEqual(actual_ctags, expected_ctags) # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) self.storage.content_ctags_add([ctag_v2], conflict_update=True) actual_ctags = list(self.storage.content_ctags_get( [self.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': self.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': self.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] self.assertEqual(actual_ctags, expected_ctags) @istest def content_fossology_license_get(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license1 = { 'id': self.sha1_1, 'licenses': ['GPL-2.0+'], 'indexer_configuration_id': tool_id, } # when self.storage.content_fossology_license_add([license1]) # then actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_2, self.sha1_1])) expected_license = { - 'id': self.sha1_1, - 'licenses': ['GPL-2.0+'], - 'tool': tool, + self.sha1_1: [{ + 'licenses': ['GPL-2.0+'], + 'tool': tool, + }] } # then self.assertEqual(actual_licenses, [expected_license]) @istest def content_fossology_license_add__new_license_added(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { - 'id': self.sha1_1, - 'licenses': ['Apache-2.0'], - 'tool': tool, + self.sha1_1: [{ + 'licenses': ['Apache-2.0'], + 'tool': tool, + }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['BSD-2-Clause'], }) self.storage.content_fossology_license_add([license_v2]) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) - expected_license.update({ - 'licenses': ['Apache-2.0', 'BSD-2-Clause'], - }) + expected_license = { + self.sha1_1: [{ + 'licenses': ['Apache-2.0', 'BSD-2-Clause'], + 'tool': tool + }] + } # license did not change as the v2 was dropped. self.assertEqual(actual_licenses, [expected_license]) @istest def content_fossology_license_add__update_in_place_duplicate(self): # given tool = self.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': self.sha1_1, 'licenses': ['CECILL'], 'indexer_configuration_id': tool_id, } # given self.storage.content_fossology_license_add([license_v1]) # conflict does nothing self.storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # then expected_license = { - 'id': self.sha1_1, - 'licenses': ['CECILL'], - 'tool': tool, + self.sha1_1: [{ + 'licenses': ['CECILL'], + 'tool': tool, + }] } self.assertEqual(actual_licenses, [expected_license]) # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['CECILL-2.0'] }) self.storage.content_fossology_license_add([license_v2], conflict_update=True) actual_licenses = list(self.storage.content_fossology_license_get( [self.sha1_1])) # license did change as the v2 was used to overwrite v1 - expected_license.update({ - 'licenses': ['CECILL-2.0'] - }) + expected_license = { + self.sha1_1: [{ + 'licenses': ['CECILL-2.0'], + 'tool': tool, + }] + } self.assertEqual(actual_licenses, [expected_license]) @istest def content_metadata_missing(self): # given tool_id = self.tools['swh-metadata-translator']['id'] - metadatas = [ + metadata = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, }, { 'id': self.sha1_1, 'indexer_configuration_id': tool_id, } ] # when - actual_missing = list(self.storage.content_metadata_missing(metadatas)) + actual_missing = list(self.storage.content_metadata_missing(metadata)) # then self.assertEqual(list(actual_missing), [ self.sha1_2, self.sha1_1, ]) # given self.storage.content_metadata_add([{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id }]) # when - actual_missing = list(self.storage.content_metadata_missing(metadatas)) + actual_missing = list(self.storage.content_metadata_missing(metadata)) # then self.assertEqual(actual_missing, [self.sha1_1]) @istest def content_metadata_get(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata1 = { 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id, } # when self.storage.content_metadata_add([metadata1]) # then - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2, self.sha1_1])) - expected_metadatas = [{ + expected_metadata = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] - self.assertEqual(actual_metadatas, expected_metadatas) + self.assertEqual(actual_metadata, expected_metadata) @istest def content_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata_v1 = { 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id, } # given self.storage.content_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'other': {}, 'name': 'test_drop_duplicated_metadata', 'version': '0.0.1' }, }) self.storage.content_metadata_add([metadata_v2]) # then - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # metadata did not change as the v2 was dropped. - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) @istest def content_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-translator']['id'] metadata_v1 = { 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'indexer_configuration_id': tool_id, } # given self.storage.content_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # then - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'other': {}, 'name': 'test_update_duplicated_metadata', 'version': '0.0.1' }, }) self.storage.content_metadata_add([metadata_v2], conflict_update=True) - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # language did not change as the v2 was dropped. - expected_metadatas_v2 = [{ + expected_metadata_v2 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, 'name': 'test_update_duplicated_metadata', 'version': '0.0.1' }, 'tool': self.tools['swh-metadata-translator'] }] # metadata did change as the v2 was used to overwrite v1 - self.assertEqual(actual_metadatas, expected_metadatas_v2) + self.assertEqual(actual_metadata, expected_metadata_v2) @istest def revision_metadata_missing(self): # given tool_id = self.tools['swh-metadata-detector']['id'] - metadatas = [ + metadata = [ { 'id': self.revision_id_1, 'indexer_configuration_id': tool_id, }, { 'id': self.revision_id_2, 'indexer_configuration_id': tool_id, } ] # when actual_missing = list(self.storage.revision_metadata_missing( - metadatas)) + metadata)) # then self.assertEqual(list(actual_missing), [ self.revision_id_1, self.revision_id_2, ]) # given self.storage.revision_metadata_add([{ 'id': self.revision_id_1, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'type': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id }]) # when actual_missing = list(self.storage.revision_metadata_missing( - metadatas)) + metadata)) # then self.assertEqual(actual_missing, [self.revision_id_2]) @istest def revision_metadata_get(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_rev = { 'id': self.revision_id_2, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'type': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id } # when self.storage.revision_metadata_add([metadata_rev]) # then - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2, self.revision_id_1])) - expected_metadatas = [{ + expected_metadata = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_rev['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] - self.assertEqual(actual_metadatas, expected_metadatas) + self.assertEqual(actual_metadata, expected_metadata) @istest def revision_metadata_add_drop_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'id': self.revision_id_1, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'type': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id, } # given self.storage.revision_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_1])) - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.revision_id_1, 'translated_metadata': metadata_v1['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'name': 'test_metadata', 'author': 'MG', }, }) self.storage.revision_metadata_add([metadata_v2]) # then - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_1])) # metadata did not change as the v2 was dropped. - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) @istest def revision_metadata_add_update_in_place_duplicate(self): # given tool_id = self.tools['swh-metadata-detector']['id'] metadata_v1 = { 'id': self.revision_id_2, 'translated_metadata': { 'developmentStatus': None, 'version': None, 'operatingSystem': None, 'description': None, 'keywords': None, 'issueTracker': None, 'name': None, 'author': None, 'relatedLink': None, 'url': None, 'type': None, 'license': None, 'maintainer': None, 'email': None, 'softwareRequirements': None, 'identifier': None }, 'indexer_configuration_id': tool_id, } # given self.storage.revision_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) # then - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v1['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'translated_metadata': { 'name': 'test_update_duplicated_metadata', 'author': 'MG' }, }) self.storage.revision_metadata_add([metadata_v2], conflict_update=True) - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) # language did not change as the v2 was dropped. - expected_metadatas_v2 = [{ + expected_metadata_v2 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v2['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] # metadata did change as the v2 was used to overwrite v1 - self.assertEqual(actual_metadatas, expected_metadatas_v2) + self.assertEqual(actual_metadata, expected_metadata_v2) @istest def indexer_configuration_add(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) # does not exist # add it actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEquals(len(actual_tools), 1) actual_tool = actual_tools[0] self.assertIsNotNone(actual_tool) # now it exists new_id = actual_tool.pop('id') self.assertEquals(actual_tool, tool) actual_tools2 = list(self.storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] self.assertIsNotNone(actual_tool2) # now it exists new_id2 = actual_tool2.pop('id') self.assertEqual(new_id, new_id2) self.assertEqual(actual_tool, actual_tool2) @istest def indexer_configuration_add_multiple(self): tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tools = list(self.storage.indexer_configuration_add([tool])) self.assertEqual(len(actual_tools), 1) new_tools = [tool, { 'tool_name': 'yet-another-tool', 'tool_version': 'version', 'tool_configuration': {}, }] actual_tools = list(self.storage.indexer_configuration_add(new_tools)) self.assertEqual(len(actual_tools), 2) # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop('id') self.assertIsNotNone(_id) self.assertIn(tool, new_tools) @istest def indexer_configuration_get_missing(self): tool = { 'tool_name': 'unknown-tool', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) @istest def indexer_configuration_get(self): tool = { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() expected_tool['id'] = 1 self.assertEqual(expected_tool, actual_tool) @istest def indexer_configuration_metadata_get_missing_context(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"context": "unknown-context"}, } actual_tool = self.storage.indexer_configuration_get(tool) self.assertIsNone(actual_tool) @istest def indexer_configuration_metadata_get(self): tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "npm"}, } actual_tool = self.storage.indexer_configuration_get(tool) expected_tool = tool.copy() expected_tool['id'] = actual_tool['id'] self.assertEqual(expected_tool, actual_tool) class IndexerTestStorage(CommonTestStorage, unittest.TestCase): """Running the tests locally. For the client api tests (remote storage), see `class`:swh.indexer.storage.test_api_client:TestRemoteStorage class. """ pass diff --git a/version.txt b/version.txt index 35a77ee..43e891d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.51-0-ga4f7af3 \ No newline at end of file +v0.0.52-0-gda92de4 \ No newline at end of file