diff --git a/sql/swh-func.sql b/sql/swh-func.sql --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -38,39 +38,6 @@ end $$; --- create a temporary table for content_ctags tmp_content_mimetype_missing, -create or replace function swh_mktemp_content_mimetype_missing() - returns void - language sql -as $$ - create temporary table tmp_content_mimetype_missing ( - id sha1, - indexer_configuration_id bigint - ) on commit drop; -$$; - -comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information'; - --- check which entries of tmp_bytea are missing from content_mimetype --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_mimetype_missing as tmp - where not exists - (select 1 from content_mimetype as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id)); - return; -end -$$; - -comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information'; - -- create a temporary table for content_mimetype tmp_content_mimetype, create or replace function swh_mktemp_content_mimetype() returns void @@ -118,70 +85,6 @@ comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; -create type content_mimetype_signature as( - id sha1, - mimetype bytea, - encoding bytea, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content mimetype from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_get() - returns setof content_mimetype_signature - language plpgsql -as $$ -begin - return query - select c.id, mimetype, encoding, - i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_mimetype c on c.id=t.id - inner join indexer_configuration i on c.indexer_configuration_id=i.id; - return; -end -$$; - -comment on function swh_content_mimetype_get() IS 'List content''s mimetypes'; - --- create a temporary table for content_language tmp_content_language, -create or replace function swh_mktemp_content_language_missing() - returns void - language sql -as $$ - create temporary table tmp_content_language_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language'; - --- check which entries of tmp_bytea are missing from content_language --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_language_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_content_language_missing as tmp - where not exists - (select 1 from content_language as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_content_language_missing() IS 'Filter missing content languages'; - -- add tmp_content_language entries to content_language, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- @@ -228,34 +131,6 @@ comment on function swh_mktemp_content_language() is 'Helper table to add content language'; -create type content_language_signature as ( - id sha1, - lang languages, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content language from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_language_get() - returns setof content_language_signature - language plpgsql -as $$ -begin - return query - select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_language c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - -comment on function swh_content_language_get() is 'List content''s language'; - -- create a temporary table for content_ctags tmp_content_ctags, create or replace function swh_mktemp_content_ctags() @@ -298,40 +173,6 @@ comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; --- create a temporary table for content_ctags missing routine -create or replace function swh_mktemp_content_ctags_missing() - returns void - language sql -as $$ - create temporary table tmp_content_ctags_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags'; - --- check which entries of tmp_bytea are missing from content_ctags --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_ctags_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_ctags_missing as tmp - where not exists - (select 1 from content_ctags as c - where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id - limit 1)); - return; -end -$$; - -comment on function swh_content_ctags_missing() IS 'Filter missing content ctags'; - create type content_ctags_signature as ( id sha1, name text, @@ -344,27 +185,6 @@ tool_configuration jsonb ); --- Retrieve list of content ctags from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_ctags_get() - returns setof content_ctags_signature - language plpgsql -as $$ -begin - return query - select c.id, c.name, c.kind, c.line, c.lang, - i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration - from tmp_bytea t - inner join content_ctags c using(id) - inner join indexer_configuration i on i.id = c.indexer_configuration_id - order by line; - return; -end -$$; - -comment on function swh_content_ctags_get() IS 'List content ctags'; - -- Search within ctags content. -- create or replace function swh_content_ctags_search( @@ -440,77 +260,7 @@ comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; -create type content_fossology_license_signature as ( - id sha1, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb, - licenses text[] -); - --- Retrieve list of content license from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_fossology_license_get() - returns setof content_fossology_license_signature - language plpgsql -as $$ -begin - return query - select cl.id, - ic.id as tool_id, - ic.tool_name, - ic.tool_version, - ic.tool_configuration, - array(select name - from fossology_license - where id = ANY(array_agg(cl.license_id))) as licenses - from tmp_bytea tcl - inner join content_fossology_license cl using(id) - inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id - group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration; - return; -end -$$; - -comment on function swh_content_fossology_license_get() IS 'List content licenses'; - -- content_metadata functions --- --- create a temporary table for content_metadata tmp_content_metadata, -create or replace function swh_mktemp_content_metadata_missing() - returns void - language sql -as $$ - create temporary table tmp_content_metadata_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; - --- check which entries of tmp_bytea are missing from content_metadata --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_metadata_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_content_metadata_missing as tmp - where not exists - (select 1 from content_metadata as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; -- add tmp_content_metadata entries to content_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. @@ -558,71 +308,8 @@ comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; --- -create type content_metadata_signature as ( - id sha1, - translated_metadata jsonb, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content metadata from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_metadata_get() - returns setof content_metadata_signature - language plpgsql -as $$ -begin - return query - select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_metadata c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - -comment on function swh_content_metadata_get() is 'List content''s metadata'; -- end content_metadata functions --- revision_metadata functions --- --- create a temporary table for revision_metadata tmp_revision_metadata, -create or replace function swh_mktemp_revision_metadata_missing() - returns void - language sql -as $$ - create temporary table tmp_revision_metadata_missing ( - id sha1_git, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata'; - --- check which entries of tmp_bytea are missing from revision_metadata --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_revision_metadata_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_revision_metadata_missing as tmp - where not exists - (select 1 from revision_metadata as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata'; - -- add tmp_revision_metadata entries to revision_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- @@ -669,33 +356,6 @@ comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; --- -create type revision_metadata_signature as ( - id sha1_git, - translated_metadata jsonb, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of revision metadata from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_revision_metadata_get() - returns setof revision_metadata_signature - language plpgsql -as $$ -begin - return query - select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join revision_metadata c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - create or replace function swh_mktemp_indexer_configuration() returns void language sql diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(114, now(), 'Work In Progress'); + values(115, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/115.sql @@ -0,0 +1,40 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 114 +-- to_version: 115 +-- description: Remove temporary table use in reading api + +insert into dbversion(version, release, description) +values(115, now(), 'Work In Progress'); + +drop function swh_mktemp_content_mimetype_missing(); +drop function swh_content_mimetype_missing(); + +drop function swh_content_mimetype_get(); +drop type content_mimetype_signature; + +drop function swh_mktemp_content_language_missing(); +drop function swh_content_language_missing(); + +drop function swh_content_language_get(); +drop type content_language_signature; + +drop function swh_mktemp_content_ctags_missing(); +drop function swh_content_ctags_missing(); + +drop function swh_content_ctags_get(); +--drop type content_ctags_signature; -- still used in swh_content_ctags_search + +drop function swh_content_fossology_license_get(); +drop type content_fossology_license_signature; + +drop function swh_mktemp_content_metadata_missing(); +drop function swh_content_metadata_missing(); + +drop function swh_content_metadata_get(); +drop type content_metadata_signature; + +drop function swh_mktemp_revision_metadata_missing(); +drop function swh_revision_metadata_missing(); + +drop function swh_revision_metadata_get(); +drop type revision_metadata_signature; diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -95,20 +95,16 @@ Args: mimetypes (iterable): iterable of dict with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: an iterable of missing id for the triplets id, tool_name, - tool_version + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_mimetype_missing(cur) - db.copy_to(mimetypes, 'tmp_content_mimetype_missing', - ['id', 'indexer_configuration_id'], - cur) - for obj in db.content_mimetype_missing_from_temp(cur): + for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] @db_transaction() @@ -119,13 +115,14 @@ Args: mimetypes (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - mimetype (bytes): raw content's mimetype - - encoding (bytes): raw content's encoding - - indexer_configuration_id (int): tool's id used to - compute the results - - conflict_update: Flag to determine if we want to - overwrite (true) or skip duplicates (false, the default) + id (bytes): sha1 identifier + mimetype (bytes): raw content's mimetype + encoding (bytes): raw content's encoding + indexer_configuration_id (int): tool's id used to + compute the results + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates + (false, the default) """ db.mktemp_content_mimetype(cur) @@ -136,8 +133,21 @@ @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_mimetype_get_from_temp(): + """Retrieve full content mimetype per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + mimetypes (iterable): dictionaries with keys: + + id (bytes): sha1 identifier + mimetype (bytes): raw content's mimetype + encoding (bytes): raw content's encoding + tool (dict): Tool used to compute the language + + """ + for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @@ -148,24 +158,34 @@ Args: languages (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: identifiers of missing languages + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_language_missing(cur) - db.copy_to(languages, 'tmp_content_language_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.content_language_missing_from_temp(cur): + for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_language_get_from_temp(): + """Retrieve full content language per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + languages (iterable): dictionaries with keys: + + id (bytes): sha1 identifier + lang (bytes): raw content's language + tool (dict): Tool used to compute the language + + """ + for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @@ -177,11 +197,12 @@ Args: languages (iterable): dictionaries with keys: - - id: sha1 - - lang: bytes + id (bytes): sha1 + lang (bytes): language detected - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates (false, the + default) """ db.mktemp_content_language(cur) @@ -204,20 +225,16 @@ Args: ctags (iterable): dicts with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool name used - - tool_version (str): associated version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - an iterable of missing id + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_ctags_missing(cur) - db.copy_to(ctags, - tblname='tmp_content_ctags_missing', - columns=['id', 'indexer_configuration_id'], - cur=cur) - for obj in db.content_ctags_missing_from_temp(cur): + for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @db_transaction_generator() @@ -227,9 +244,18 @@ Args: ids (iterable): sha1 checksums + Yields: + Dictionaries with keys: + + id (bytes): content's identifier + name (str): symbol's name + kind (str): symbol's kind + language (str): language for that content + tool (dict): tool used to compute the ctags' info + + """ - db.store_tmp_bytea(ids, cur) - for c in db.content_ctags_get_from_temp(): + for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @db_transaction() @@ -240,9 +266,9 @@ Args: ctags (iterable): dictionaries with keys: - - id (bytes): sha1 - - ctags ([list): List of dictionary with keys: name, kind, - line, language + id (bytes): sha1 + ctags ([list): List of dictionary with keys: name, kind, + line, language """ def _convert_ctags(__ctags): @@ -289,13 +315,12 @@ Yields: list: dictionaries with the following keys: - - id (bytes) - - licenses ([str]): associated licenses for that content + id (bytes) + licenses ([str]): associated licenses for that content + tool (dict): Tool used to compute the license """ - db.store_tmp_bytea(ids, cur) - - for c in db.content_fossology_license_get_from_temp(): + for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) yield converters.db_to_fossology_license(license) @@ -339,24 +364,34 @@ Args: metadatas (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: missing ids + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_metadata_missing(cur) - db.copy_to(metadatas, 'tmp_content_metadata_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.content_metadata_missing_from_temp(cur): + for obj in db.content_metadata_missing_from_list(metadatas, cur): yield obj[0] @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_metadata_get_from_temp(): + """Retrieve metadata per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + id (bytes) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @@ -368,8 +403,8 @@ Args: metadatas (iterable): dictionaries with keys: - - id: sha1 - - translated_metadata: bytes / jsonb ? + id: sha1 + translated_metadata: bytes / jsonb ? conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) @@ -390,24 +425,33 @@ Args: metadatas (iterable): dictionaries with keys: - - id (bytes): sha1_git revision identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1_git revision identifier + indexer_configuration_id (int): tool used to compute + the results Returns: iterable: missing ids """ - db.mktemp_revision_metadata_missing(cur) - db.copy_to(metadatas, 'tmp_revision_metadata_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.revision_metadata_missing_from_temp(cur): + for obj in db.revision_metadata_missing_from_list(metadatas, cur): yield obj[0] @db_transaction_generator() def revision_metadata_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.revision_metadata_get_from_temp(): + """Retrieve revision metadata per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + id (bytes) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.revision_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_metadata_cols, c))) @@ -434,54 +478,6 @@ cur) db.revision_metadata_add_from_temp(conflict_update, cur) - @db_transaction() - def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, - db=None, cur=None): - """ Add an origin_metadata for the origin at ts with provenance and - metadata. - - Args: - origin_id (int): the origin's id for which the metadata is added - ts (datetime): timestamp of the found metadata - provider (int): the provider of metadata (ex:'hal') - tool (int): tool used to extract metadata - metadata (jsonb): the metadata retrieved at the time and location - - Returns: - id (int): the origin_metadata unique id - """ - if isinstance(ts, str): - ts = dateutil.parser.parse(ts) - - return db.origin_metadata_add(origin_id, ts, provider, tool, - metadata, cur) - - @db_transaction_generator() - def origin_metadata_get_by(self, origin_id, provider_type=None, db=None, - cur=None): - """Retrieve list of all origin_metadata entries for the origin_id - - Args: - origin_id (int): the unique origin identifier - provider_type (str): (optional) type of provider - - Returns: - list of dicts: the origin_metadata dictionary with the keys: - - - id (int): origin_metadata's id - - origin_id (int): origin's id - - discovery_date (datetime): timestamp of discovery - - tool_id (int): metadata's extracting tool - - metadata (jsonb) - - provider_id (int): metadata's provider - - provider_name (str) - - provider_type (str) - - provider_url (str) - - """ - for line in db.origin_metadata_get_by(origin_id, provider_type, cur): - yield dict(zip(db.origin_metadata_get_cols, line)) - @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,7 +6,7 @@ from swh.model import hashutil from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes -from swh.storage.db import line_to_bytes +from swh.storage.db import line_to_bytes, execute_values_to_bytes class Db(BaseDb): @@ -24,20 +24,33 @@ self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', ['id'], cur) - content_mimetype_cols = [ - 'id', 'mimetype', 'encoding', - 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] - - @stored_procedure('swh_mktemp_content_mimetype_missing') - def mktemp_content_mimetype_missing(self, cur=None): pass + content_mimetype_hash_keys = ['id', 'indexer_configuration_id'] - def content_mimetype_missing_from_temp(self, cur=None): + def content_mimetype_missing_from_list(self, mimetypes, cur=None): """List missing mimetypes. """ cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_mimetype_missing()") - yield from cursor_to_bytes(cur) + keys = ', '.join(self.content_mimetype_hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) + for key in self.content_mimetype_hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from content_mimetype c + where %s + ) + """ % (keys, keys, equality), + (tuple(m[k] for k in self.content_mimetype_hash_keys) + for m in mimetypes) + ) + + content_mimetype_cols = [ + 'id', 'mimetype', 'encoding', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_mimetype') def mktemp_content_mimetype(self, cur=None): pass @@ -46,12 +59,70 @@ self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", (conflict_update, )) - def content_mimetype_get_from_temp(self, cur=None): + def _convert_key(self, key, main_table='c'): + """Convert keys according to specific use in the module. + Args: + key (str): Key expression to change according to the alias + used in the query + main_table (str): Alias to use for the main table. Default + to c for content_{something}. + + Expected: + Tables content_{something} being aliased as 'c' (something + in {language, mimetype, ...}), table indexer_configuration + being aliased as 'i'. + + """ + if key == 'id': + return '%s.id' % main_table + elif key == 'tool_id': + return 'i.id as tool_id' + elif key == 'licenses': + return ''' + array(select name + from fossology_license + where id = ANY( + array_agg(%s.license_id))) as licenses''' % main_table + return key + + def content_mimetype_get_from_list(self, ids, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_mimetype_get()" % ( - ','.join(self.content_mimetype_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_mimetype_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_mimetype c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) + + content_language_hash_keys = ['id', 'indexer_configuration_id'] + + def content_language_missing_from_list(self, languages, cur=None): + """List missing languages. + + """ + cur = self._cursor(cur) + keys = ', '.join(self.content_language_hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) + for key in self.content_language_hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from content_language c + where %s + ) + """ % (keys, keys, equality), + (tuple(l[k] for k in self.content_language_hash_keys) + for l in languages) + ) content_language_cols = [ 'id', 'lang', @@ -60,27 +131,48 @@ @stored_procedure('swh_mktemp_content_language') def mktemp_content_language(self, cur=None): pass - @stored_procedure('swh_mktemp_content_language_missing') - def mktemp_content_language_missing(self, cur=None): pass - - def content_language_missing_from_temp(self, cur=None): - """List missing languages. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_language_missing()") - yield from cursor_to_bytes(cur) - def content_language_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_language_add(%s)", (conflict_update, )) - def content_language_get_from_temp(self, cur=None): + def content_language_get_from_list(self, ids, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_language_get()" % ( - ','.join(self.content_language_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_language_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_language c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) + + content_ctags_hash_keys = ['id', 'indexer_configuration_id'] + + def content_ctags_missing_from_list(self, ctags, cur=None): + """List missing ctags. + + """ + cur = self._cursor(cur) + keys = ', '.join(self.content_ctags_hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) + for key in self.content_ctags_hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from content_ctags c + where %s + ) + """ % (keys, keys, equality), + (tuple(c[k] for k in self.content_ctags_hash_keys) + for c in ctags) + ) content_ctags_cols = [ 'id', 'name', 'kind', 'line', 'lang', @@ -89,27 +181,24 @@ @stored_procedure('swh_mktemp_content_ctags') def mktemp_content_ctags(self, cur=None): pass - @stored_procedure('swh_mktemp_content_ctags_missing') - def mktemp_content_ctags_missing(self, cur=None): pass - - def content_ctags_missing_from_temp(self, cur=None): - """List missing ctags. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_ctags_missing()") - yield from cursor_to_bytes(cur) - def content_ctags_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", (conflict_update, )) - def content_ctags_get_from_temp(self, cur=None): + def content_ctags_get_from_list(self, ids, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_ctags_get()" % ( - ','.join(self.content_ctags_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_ctags_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_ctags c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) @@ -147,15 +236,48 @@ "SELECT swh_content_fossology_license_add(%s)", (conflict_update, )) - def content_fossology_license_get_from_temp(self, cur=None): - """Retrieve licenses per content. + def content_fossology_license_get_from_list(self, ids, cur=None): + """Retrieve licenses per id. """ cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_fossology_license_get()" % ( - ','.join(self.content_fossology_license_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_fossology_license_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_fossology_license c on t.id=c.id + inner join indexer_configuration i + on i.id=c.indexer_configuration_id + group by c.id, i.id, i.tool_name, i.tool_version, + i.tool_configuration; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) + + content_metadata_hash_keys = ['id', 'indexer_configuration_id'] + + def content_metadata_missing_from_list(self, metadata, cur=None): + """List missing metadata. + + """ + cur = self._cursor(cur) + keys = ', '.join(self.content_metadata_hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) + for key in self.content_metadata_hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from content_metadata c + where %s + ) + """ % (keys, keys, equality), + (tuple(m[k] for k in self.content_metadata_hash_keys) + for m in metadata) + ) content_metadata_cols = [ 'id', 'translated_metadata', @@ -164,27 +286,48 @@ @stored_procedure('swh_mktemp_content_metadata') def mktemp_content_metadata(self, cur=None): pass - @stored_procedure('swh_mktemp_content_metadata_missing') - def mktemp_content_metadata_missing(self, cur=None): pass - - def content_metadata_missing_from_temp(self, cur=None): - """List missing metadatas. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_metadata_missing()") - yield from cursor_to_bytes(cur) - def content_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", (conflict_update, )) - def content_metadata_get_from_temp(self, cur=None): + def content_metadata_get_from_list(self, ids, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_metadata_get()" % ( - ','.join(self.content_metadata_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_metadata_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_metadata c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) + + content_revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] + + def revision_metadata_missing_from_list(self, metadata, cur=None): + """List missing metadata. + + """ + cur = self._cursor(cur) + keys = ', '.join(self.content_revision_metadata_hash_keys) + equality = ' AND '.join( + ('t.%s = r.%s' % (key, key)) + for key in self.content_revision_metadata_hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from revision_metadata r + where %s + ) + """ % (keys, keys, equality), + (tuple(m[k] for k in self.content_revision_metadata_hash_keys) + for m in metadata) + ) revision_metadata_cols = [ 'id', 'translated_metadata', @@ -193,27 +336,25 @@ @stored_procedure('swh_mktemp_revision_metadata') def mktemp_revision_metadata(self, cur=None): pass - @stored_procedure('swh_mktemp_revision_metadata_missing') - def mktemp_revision_metadata_missing(self, cur=None): pass - - def revision_metadata_missing_from_temp(self, cur=None): - """List missing metadatas. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_revision_metadata_missing()") - yield from cursor_to_bytes(cur) - def revision_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", (conflict_update, )) - def revision_metadata_get_from_temp(self, cur=None): + def revision_metadata_get_from_list(self, ids, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_revision_metadata_get()" % ( - ','.join(self.revision_metadata_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(lambda k: self._convert_key(k, main_table='r'), + self.revision_metadata_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join revision_metadata r + on r.id=t.id + inner join indexer_configuration i + on r.indexer_configuration_id=i.id; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration']