diff --git a/sql/swh-func.sql b/sql/swh-func.sql --- a/sql/swh-func.sql +++ b/sql/swh-func.sql @@ -8,16 +8,6 @@ comment on function hash_sha1(text) is 'Compute sha1 hash as text'; --- create a temporary table with a single "bytea" column for fast object lookup. -create or replace function swh_mktemp_bytea() - returns void - language sql -as $$ - create temporary table tmp_bytea ( - id bytea - ) on commit drop; -$$; - -- create a temporary table called tmp_TBLNAME, mimicking existing table -- TBLNAME -- @@ -38,39 +28,6 @@ end $$; --- create a temporary table for content_ctags tmp_content_mimetype_missing, -create or replace function swh_mktemp_content_mimetype_missing() - returns void - language sql -as $$ - create temporary table tmp_content_mimetype_missing ( - id sha1, - indexer_configuration_id bigint - ) on commit drop; -$$; - -comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information'; - --- check which entries of tmp_bytea are missing from content_mimetype --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_mimetype_missing as tmp - where not exists - (select 1 from content_mimetype as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id)); - return; -end -$$; - -comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information'; - -- create a temporary table for content_mimetype tmp_content_mimetype, create or replace function swh_mktemp_content_mimetype() returns void @@ -118,70 +75,6 @@ comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; -create type content_mimetype_signature as( - id sha1, - mimetype bytea, - encoding bytea, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content mimetype from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_mimetype_get() - returns setof content_mimetype_signature - language plpgsql -as $$ -begin - return query - select c.id, mimetype, encoding, - i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_mimetype c on c.id=t.id - inner join indexer_configuration i on c.indexer_configuration_id=i.id; - return; -end -$$; - -comment on function swh_content_mimetype_get() IS 'List content''s mimetypes'; - --- create a temporary table for content_language tmp_content_language, -create or replace function swh_mktemp_content_language_missing() - returns void - language sql -as $$ - create temporary table tmp_content_language_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language'; - --- check which entries of tmp_bytea are missing from content_language --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. 
call this function -create or replace function swh_content_language_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_content_language_missing as tmp - where not exists - (select 1 from content_language as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_content_language_missing() IS 'Filter missing content languages'; - -- add tmp_content_language entries to content_language, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- @@ -228,34 +121,6 @@ comment on function swh_mktemp_content_language() is 'Helper table to add content language'; -create type content_language_signature as ( - id sha1, - lang languages, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content language from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_language_get() - returns setof content_language_signature - language plpgsql -as $$ -begin - return query - select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_language c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - -comment on function swh_content_language_get() is 'List content''s language'; - -- create a temporary table for content_ctags tmp_content_ctags, create or replace function swh_mktemp_content_ctags() @@ -298,40 +163,6 @@ comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; --- create a temporary table for content_ctags missing routine -create or replace function swh_mktemp_content_ctags_missing() - returns void - language sql -as $$ - create temporary table tmp_content_ctags_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags'; - --- check which entries of tmp_bytea are missing from content_ctags --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_ctags_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - (select id::sha1 from tmp_content_ctags_missing as tmp - where not exists - (select 1 from content_ctags as c - where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id - limit 1)); - return; -end -$$; - -comment on function swh_content_ctags_missing() IS 'Filter missing content ctags'; - create type content_ctags_signature as ( id sha1, name text, @@ -344,27 +175,6 @@ tool_configuration jsonb ); --- Retrieve list of content ctags from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function -create or replace function swh_content_ctags_get() - returns setof content_ctags_signature - language plpgsql -as $$ -begin - return query - select c.id, c.name, c.kind, c.line, c.lang, - i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration - from tmp_bytea t - inner join content_ctags c using(id) - inner join indexer_configuration i on i.id = c.indexer_configuration_id - order by line; - return; -end -$$; - -comment on function swh_content_ctags_get() IS 'List content ctags'; - -- Search within ctags content. 
-- create or replace function swh_content_ctags_search( @@ -440,77 +250,7 @@ comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; -create type content_fossology_license_signature as ( - id sha1, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb, - licenses text[] -); - --- Retrieve list of content license from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_fossology_license_get() - returns setof content_fossology_license_signature - language plpgsql -as $$ -begin - return query - select cl.id, - ic.id as tool_id, - ic.tool_name, - ic.tool_version, - ic.tool_configuration, - array(select name - from fossology_license - where id = ANY(array_agg(cl.license_id))) as licenses - from tmp_bytea tcl - inner join content_fossology_license cl using(id) - inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id - group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration; - return; -end -$$; - -comment on function swh_content_fossology_license_get() IS 'List content licenses'; - -- content_metadata functions --- --- create a temporary table for content_metadata tmp_content_metadata, -create or replace function swh_mktemp_content_metadata_missing() - returns void - language sql -as $$ - create temporary table tmp_content_metadata_missing ( - id sha1, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; - --- check which entries of tmp_bytea are missing from content_metadata --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_content_metadata_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_content_metadata_missing as tmp - where not exists - (select 1 from content_metadata as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; -- add tmp_content_metadata entries to content_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. @@ -558,71 +298,8 @@ comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; --- -create type content_metadata_signature as ( - id sha1, - translated_metadata jsonb, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of content metadata from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. 
call this function -create or replace function swh_content_metadata_get() - returns setof content_metadata_signature - language plpgsql -as $$ -begin - return query - select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join content_metadata c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - -comment on function swh_content_metadata_get() is 'List content''s metadata'; -- end content_metadata functions --- revision_metadata functions --- --- create a temporary table for revision_metadata tmp_revision_metadata, -create or replace function swh_mktemp_revision_metadata_missing() - returns void - language sql -as $$ - create temporary table tmp_revision_metadata_missing ( - id sha1_git, - indexer_configuration_id integer - ) on commit drop; -$$; - -comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata'; - --- check which entries of tmp_bytea are missing from revision_metadata --- --- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, --- 2. call this function -create or replace function swh_revision_metadata_missing() - returns setof sha1 - language plpgsql -as $$ -begin - return query - select id::sha1 from tmp_revision_metadata_missing as tmp - where not exists - (select 1 from revision_metadata as c - where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); - return; -end -$$; - -comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata'; - -- add tmp_revision_metadata entries to revision_metadata, overwriting -- duplicates if conflict_update is true, skipping duplicates otherwise. -- @@ -669,33 +346,6 @@ comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; --- -create type revision_metadata_signature as ( - id sha1_git, - translated_metadata jsonb, - tool_id integer, - tool_name text, - tool_version text, - tool_configuration jsonb -); - --- Retrieve list of revision metadata from the temporary table. --- --- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. 
call this function -create or replace function swh_revision_metadata_get() - returns setof revision_metadata_signature - language plpgsql -as $$ -begin - return query - select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration - from tmp_bytea t - inner join revision_metadata c on c.id = t.id - inner join indexer_configuration i on i.id=c.indexer_configuration_id; - return; -end -$$; - create or replace function swh_mktemp_indexer_configuration() returns void language sql diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql --- a/sql/swh-schema.sql +++ b/sql/swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(114, now(), 'Work In Progress'); + values(115, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/115.sql @@ -0,0 +1,42 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 114 +-- to_version: 115 +-- description: Remove temporary table use in reading api + +insert into dbversion(version, release, description) +values(115, now(), 'Work In Progress'); + +drop function swh_mktemp_content_mimetype_missing(); +drop function swh_content_mimetype_missing(); + +drop function swh_content_mimetype_get(); +drop type content_mimetype_signature; + +drop function swh_mktemp_content_language_missing(); +drop function swh_content_language_missing(); + +drop function swh_content_language_get(); +drop type content_language_signature; + +drop function swh_mktemp_content_ctags_missing(); +drop function swh_content_ctags_missing(); + +drop function swh_content_ctags_get(); +--drop type content_ctags_signature; -- still used in swh_content_ctags_search + +drop function swh_content_fossology_license_get(); +drop type content_fossology_license_signature; + +drop function swh_mktemp_content_metadata_missing(); +drop function swh_content_metadata_missing(); + +drop function swh_content_metadata_get(); +drop type content_metadata_signature; + +drop function swh_mktemp_revision_metadata_missing(); +drop function swh_revision_metadata_missing(); + +drop function swh_revision_metadata_get(); +drop type revision_metadata_signature; + +drop function swh_mktemp_bytea(); diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -5,7 +5,6 @@ import json -import dateutil.parser import psycopg2 from swh.storage.common import db_transaction_generator, db_transaction @@ -95,20 +94,16 @@ Args: mimetypes (iterable): iterable of dict with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: an iterable of missing id for the triplets id, tool_name, - tool_version + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_mimetype_missing(cur) - db.copy_to(mimetypes, 'tmp_content_mimetype_missing', - ['id', 'indexer_configuration_id'], - cur) - for obj in db.content_mimetype_missing_from_temp(cur): + for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] @db_transaction() @@ -119,13 +114,14 @@ Args: mimetypes (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - 
mimetype (bytes): raw content's mimetype - - encoding (bytes): raw content's encoding - - indexer_configuration_id (int): tool's id used to - compute the results - - conflict_update: Flag to determine if we want to - overwrite (true) or skip duplicates (false, the default) + id (bytes): sha1 identifier + mimetype (bytes): raw content's mimetype + encoding (bytes): raw content's encoding + indexer_configuration_id (int): tool's id used to + compute the results + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates + (false, the default) """ db.mktemp_content_mimetype(cur) @@ -136,8 +132,21 @@ @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_mimetype_get_from_temp(): + """Retrieve full content mimetype per ids. + + Args: + ids (iterable): sha1 identifier + + Yields: + mimetypes (iterable): dictionaries with keys: + + id (bytes): sha1 identifier + mimetype (bytes): raw content's mimetype + encoding (bytes): raw content's encoding + tool (dict): Tool used to compute the mimetype + + """ + for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @@ -148,24 +157,34 @@ Args: languages (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: identifiers of missing languages + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_language_missing(cur) - db.copy_to(languages, 'tmp_content_language_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.content_language_missing_from_temp(cur): + for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_language_get_from_temp(): + """Retrieve full content language per ids.
+ + Args: + ids (iterable): sha1 identifier + + Yields: + languages (iterable): dictionaries with keys: + + id (bytes): sha1 identifier + lang (bytes): raw content's language + tool (dict): Tool used to compute the language + + """ + for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @@ -177,11 +196,12 @@ Args: languages (iterable): dictionaries with keys: - - id: sha1 - - lang: bytes + id (bytes): sha1 + lang (bytes): language detected - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) + conflict_update (bool): Flag to determine if we want to + overwrite (true) or skip duplicates (false, the + default) """ db.mktemp_content_language(cur) @@ -204,20 +224,16 @@ Args: ctags (iterable): dicts with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool name used - - tool_version (str): associated version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - an iterable of missing id + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_ctags_missing(cur) - db.copy_to(ctags, - tblname='tmp_content_ctags_missing', - columns=['id', 'indexer_configuration_id'], - cur=cur) - for obj in db.content_ctags_missing_from_temp(cur): + for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @db_transaction_generator() @@ -227,9 +243,18 @@ Args: ids (iterable): sha1 checksums + Yields: + Dictionaries with keys: + + id (bytes): content's identifier + name (str): symbol's name + kind (str): symbol's kind + language (str): language for that content + tool (dict): tool used to compute the ctags' info + + """ - db.store_tmp_bytea(ids, cur) - for c in db.content_ctags_get_from_temp(): + for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @db_transaction() @@ -240,9 +265,9 @@ Args: ctags (iterable): dictionaries with keys: - - id (bytes): sha1 - - ctags ([list): List of dictionary with keys: name, kind, - line, language + id (bytes): sha1 + ctags ([list): List of dictionary with keys: name, kind, + line, language """ def _convert_ctags(__ctags): @@ -289,13 +314,12 @@ Yields: list: dictionaries with the following keys: - - id (bytes) - - licenses ([str]): associated licenses for that content + id (bytes) + licenses ([str]): associated licenses for that content + tool (dict): Tool used to compute the license """ - db.store_tmp_bytea(ids, cur) - - for c in db.content_fossology_license_get_from_temp(): + for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) yield converters.db_to_fossology_license(license) @@ -333,43 +357,53 @@ db.content_fossology_license_add_from_temp(conflict_update, cur) @db_transaction_generator() - def content_metadata_missing(self, metadatas, db=None, cur=None): - """List metadatas missing from storage. + def content_metadata_missing(self, metadata, db=None, cur=None): + """List metadata missing from storage. 
Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - - id (bytes): sha1 identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1 identifier + indexer_configuration_id (int): tool used to compute + the results - Returns: - iterable: missing ids + Yields: + an iterable of missing id for the tuple (id, + indexer_configuration_id) """ - db.mktemp_content_metadata_missing(cur) - db.copy_to(metadatas, 'tmp_content_metadata_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.content_metadata_missing_from_temp(cur): + for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.content_metadata_get_from_temp(): + """Retrieve metadata per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + id (bytes) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @db_transaction() - def content_metadata_add(self, metadatas, conflict_update=False, db=None, + def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadatas not present in storage. + """Add metadata not present in storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - - id: sha1 - - translated_metadata: bytes / jsonb ? + id: sha1 + translated_metadata: bytes / jsonb ? conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) @@ -378,46 +412,55 @@ db.mktemp_content_metadata(cur) # empty metadata is mapped to 'unknown' - db.copy_to(metadatas, 'tmp_content_metadata', + db.copy_to(metadata, 'tmp_content_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @db_transaction_generator() - def revision_metadata_missing(self, metadatas, db=None, cur=None): - """List metadatas missing from storage. + def revision_metadata_missing(self, metadata, db=None, cur=None): + """List metadata missing from storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - - id (bytes): sha1_git revision identifier - - tool_name (str): tool used to compute the results - - tool_version (str): associated tool's version + id (bytes): sha1_git revision identifier + indexer_configuration_id (int): tool used to compute + the results Returns: iterable: missing ids """ - db.mktemp_revision_metadata_missing(cur) - db.copy_to(metadatas, 'tmp_revision_metadata_missing', - ['id', 'indexer_configuration_id'], cur) - for obj in db.revision_metadata_missing_from_temp(cur): + for obj in db.revision_metadata_missing_from_list(metadata, cur): yield obj[0] @db_transaction_generator() def revision_metadata_get(self, ids, db=None, cur=None): - db.store_tmp_bytea(ids, cur) - for c in db.revision_metadata_get_from_temp(): + """Retrieve revision metadata per id. 
+ + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + id (bytes) + translated_metadata (str): associated metadata + tool (dict): tool used to compute metadata + + """ + for c in db.revision_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_metadata_cols, c))) @db_transaction() - def revision_metadata_add(self, metadatas, conflict_update=False, db=None, + def revision_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadatas not present in storage. + """Add metadata not present in storage. Args: - metadatas (iterable): dictionaries with keys: + metadata (iterable): dictionaries with keys: - id: sha1_git of revision - translated_metadata: bytes / jsonb ? @@ -429,59 +472,11 @@ db.mktemp_revision_metadata(cur) # empty metadata is mapped to 'unknown' - db.copy_to(metadatas, 'tmp_revision_metadata', + db.copy_to(metadata, 'tmp_revision_metadata', ['id', 'translated_metadata', 'indexer_configuration_id'], cur) db.revision_metadata_add_from_temp(conflict_update, cur) - @db_transaction() - def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, - db=None, cur=None): - """ Add an origin_metadata for the origin at ts with provenance and - metadata. - - Args: - origin_id (int): the origin's id for which the metadata is added - ts (datetime): timestamp of the found metadata - provider (int): the provider of metadata (ex:'hal') - tool (int): tool used to extract metadata - metadata (jsonb): the metadata retrieved at the time and location - - Returns: - id (int): the origin_metadata unique id - """ - if isinstance(ts, str): - ts = dateutil.parser.parse(ts) - - return db.origin_metadata_add(origin_id, ts, provider, tool, - metadata, cur) - - @db_transaction_generator() - def origin_metadata_get_by(self, origin_id, provider_type=None, db=None, - cur=None): - """Retrieve list of all origin_metadata entries for the origin_id - - Args: - origin_id (int): the unique origin identifier - provider_type (str): (optional) type of provider - - Returns: - list of dicts: the origin_metadata dictionary with the keys: - - - id (int): origin_metadata's id - - origin_id (int): origin's id - - discovery_date (datetime): timestamp of discovery - - tool_id (int): metadata's extracting tool - - metadata (jsonb) - - provider_id (int): metadata's provider - - provider_name (str) - - provider_type (str) - - provider_url (str) - - """ - for line in db.origin_metadata_get_by(origin_id, provider_type, cur): - yield dict(zip(db.origin_metadata_get_cols, line)) - @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): """Add new tools to the storage. 
diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -70,26 +70,26 @@ def content_fossology_license_get(self, ids): return self.post('content/fossology_license', {'ids': ids}) - def content_metadata_add(self, metadatas, conflict_update=False): + def content_metadata_add(self, metadata, conflict_update=False): return self.post('content_metadata/add', { - 'metadatas': metadatas, + 'metadata': metadata, 'conflict_update': conflict_update, }) - def content_metadata_missing(self, metadatas): - return self.post('content_metadata/missing', {'metadatas': metadatas}) + def content_metadata_missing(self, metadata): + return self.post('content_metadata/missing', {'metadata': metadata}) def content_metadata_get(self, ids): return self.post('content_metadata', {'ids': ids}) - def revision_metadata_add(self, metadatas, conflict_update=False): + def revision_metadata_add(self, metadata, conflict_update=False): return self.post('revision_metadata/add', { - 'metadatas': metadatas, + 'metadata': metadata, 'conflict_update': conflict_update, }) - def revision_metadata_missing(self, metadatas): - return self.post('revision_metadata/missing', {'metadatas': metadatas}) + def revision_metadata_missing(self, metadata): + return self.post('revision_metadata/missing', {'metadata': metadata}) def revision_metadata_get(self, ids): return self.post('revision_metadata', {'ids': ids}) diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,38 +6,55 @@ from swh.model import hashutil from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes -from swh.storage.db import line_to_bytes +from swh.storage.db import line_to_bytes, execute_values_to_bytes class Db(BaseDb): """Proxy to the SWH Indexer DB, with wrappers around stored procedures """ - @stored_procedure('swh_mktemp_bytea') - def mktemp_bytea(self, cur=None): pass + content_mimetype_hash_keys = ['id', 'indexer_configuration_id'] - def store_tmp_bytea(self, ids, cur=None): - """Store the given identifiers in a new tmp_bytea table""" - cur = self._cursor(cur) - - self.mktemp_bytea(cur) - self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', - ['id'], cur) + def _missing_from_list(self, table, data, hash_keys, cur=None): + """List the entries of data whose hash_keys are missing from table. - content_mimetype_cols = [ - 'id', 'mimetype', 'encoding', - 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + Args: + table (str): Table name (e.g. content_mimetype, content_language, + etc.) + data (iterable): Dicts holding the values to check for existence + hash_keys ([str]): List of keys to read from each dict in data. - @stored_procedure('swh_mktemp_content_mimetype_missing') - def mktemp_content_mimetype_missing(self, cur=None): pass + Yields: + The entries of data that are missing from the db.
- def content_mimetype_missing_from_temp(self, cur=None): + """ + cur = self._cursor(cur) + keys = ', '.join(hash_keys) + equality = ' AND '.join( + ('t.%s = c.%s' % (key, key)) for key in hash_keys + ) + yield from execute_values_to_bytes( + cur, """ + select %s from (values %%s) as t(%s) + where not exists ( + select 1 from %s c + where %s + ) + """ % (keys, keys, table, equality), + (tuple(m[k] for k in hash_keys) for m in data) + ) + + def content_mimetype_missing_from_list(self, mimetypes, cur=None): """List missing mimetypes. """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_mimetype_missing()") - yield from cursor_to_bytes(cur) + yield from self._missing_from_list( + 'content_mimetype', mimetypes, self.content_mimetype_hash_keys, + cur=cur) + + content_mimetype_cols = [ + 'id', 'mimetype', 'encoding', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_mimetype') def mktemp_content_mimetype(self, cur=None): pass @@ -46,12 +63,60 @@ self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", (conflict_update, )) - def content_mimetype_get_from_temp(self, cur=None): + def _convert_key(self, key, main_table='c'): + """Convert keys according to specific use in the module. + Args: + key (str): Key expression to change according to the alias + used in the query + main_table (str): Alias to use for the main table. Default + to c for content_{something}. + + Expected: + Tables content_{something} being aliased as 'c' (something + in {language, mimetype, ...}), table indexer_configuration + being aliased as 'i'. + + """ + if key == 'id': + return '%s.id' % main_table + elif key == 'tool_id': + return 'i.id as tool_id' + elif key == 'licenses': + return ''' + array(select name + from fossology_license + where id = ANY( + array_agg(%s.license_id))) as licenses''' % main_table + return key + + def _get_from_list(self, table, ids, cols, cur=None): cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_mimetype_get()" % ( - ','.join(self.content_mimetype_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join %s c + on c.id=t.id + inner join indexer_configuration i + on c.indexer_configuration_id=i.id; + """ % (', '.join(keys), table), + ((_id,) for _id in ids) + ) + + def content_mimetype_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_mimetype', ids, self.content_mimetype_cols, cur=cur) + + content_language_hash_keys = ['id', 'indexer_configuration_id'] + + def content_language_missing_from_list(self, languages, cur=None): + """List missing languages. + + """ + yield from self._missing_from_list( + 'content_language', languages, self.content_language_hash_keys, + cur=cur) content_language_cols = [ 'id', 'lang', @@ -60,27 +125,23 @@ @stored_procedure('swh_mktemp_content_language') def mktemp_content_language(self, cur=None): pass - @stored_procedure('swh_mktemp_content_language_missing') - def mktemp_content_language_missing(self, cur=None): pass - - def content_language_missing_from_temp(self, cur=None): - """List missing languages. 
- - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_language_missing()") - yield from cursor_to_bytes(cur) - def content_language_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_language_add(%s)", (conflict_update, )) - def content_language_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_language_get()" % ( - ','.join(self.content_language_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def content_language_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_language', ids, self.content_language_cols, cur=cur) + + content_ctags_hash_keys = ['id', 'indexer_configuration_id'] + + def content_ctags_missing_from_list(self, ctags, cur=None): + """List missing ctags. + + """ + yield from self._missing_from_list( + 'content_ctags', ctags, self.content_ctags_hash_keys, + cur=cur) content_ctags_cols = [ 'id', 'name', 'kind', 'line', 'lang', @@ -89,27 +150,13 @@ @stored_procedure('swh_mktemp_content_ctags') def mktemp_content_ctags(self, cur=None): pass - @stored_procedure('swh_mktemp_content_ctags_missing') - def mktemp_content_ctags_missing(self, cur=None): pass - - def content_ctags_missing_from_temp(self, cur=None): - """List missing ctags. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_ctags_missing()") - yield from cursor_to_bytes(cur) - def content_ctags_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", (conflict_update, )) - def content_ctags_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_ctags_get()" % ( - ','.join(self.content_ctags_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def content_ctags_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_ctags', ids, self.content_ctags_cols, cur=cur) def content_ctags_search(self, expression, last_sha1, limit, cur=None): cur = self._cursor(cur) @@ -147,15 +194,34 @@ "SELECT swh_content_fossology_license_add(%s)", (conflict_update, )) - def content_fossology_license_get_from_temp(self, cur=None): - """Retrieve licenses per content. + def content_fossology_license_get_from_list(self, ids, cur=None): + """Retrieve licenses per id. """ cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_fossology_license_get()" % ( - ','.join(self.content_fossology_license_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + keys = map(self._convert_key, self.content_fossology_license_cols) + yield from execute_values_to_bytes( + cur, """ + select %s + from (values %%s) as t(id) + inner join content_fossology_license c on t.id=c.id + inner join indexer_configuration i + on i.id=c.indexer_configuration_id + group by c.id, i.id, i.tool_name, i.tool_version, + i.tool_configuration; + """ % ', '.join(keys), + ((_id,) for _id in ids) + ) + + content_metadata_hash_keys = ['id', 'indexer_configuration_id'] + + def content_metadata_missing_from_list(self, metadata, cur=None): + """List missing metadata. 
+ + """ + yield from self._missing_from_list( + 'content_metadata', metadata, self.content_metadata_hash_keys, + cur=cur) content_metadata_cols = [ 'id', 'translated_metadata', @@ -164,27 +230,23 @@ @stored_procedure('swh_mktemp_content_metadata') def mktemp_content_metadata(self, cur=None): pass - @stored_procedure('swh_mktemp_content_metadata_missing') - def mktemp_content_metadata_missing(self, cur=None): pass - - def content_metadata_missing_from_temp(self, cur=None): - """List missing metadatas. - - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_content_metadata_missing()") - yield from cursor_to_bytes(cur) - def content_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", (conflict_update, )) - def content_metadata_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_content_metadata_get()" % ( - ','.join(self.content_metadata_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def content_metadata_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'content_metadata', ids, self.content_metadata_cols, cur=cur) + + revision_metadata_hash_keys = ['id', 'indexer_configuration_id'] + + def revision_metadata_missing_from_list(self, metadata, cur=None): + """List missing metadata. + + """ + yield from self._missing_from_list( + 'revision_metadata', metadata, self.revision_metadata_hash_keys, + cur=cur) revision_metadata_cols = [ 'id', 'translated_metadata', @@ -193,27 +255,13 @@ @stored_procedure('swh_mktemp_revision_metadata') def mktemp_revision_metadata(self, cur=None): pass - @stored_procedure('swh_mktemp_revision_metadata_missing') - def mktemp_revision_metadata_missing(self, cur=None): pass - - def revision_metadata_missing_from_temp(self, cur=None): - """List missing metadatas. 
- - """ - cur = self._cursor(cur) - cur.execute("SELECT * FROM swh_revision_metadata_missing()") - yield from cursor_to_bytes(cur) - def revision_metadata_add_from_temp(self, conflict_update, cur=None): self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", (conflict_update, )) - def revision_metadata_get_from_temp(self, cur=None): - cur = self._cursor(cur) - query = "SELECT %s FROM swh_revision_metadata_get()" % ( - ','.join(self.revision_metadata_cols)) - cur.execute(query) - yield from cursor_to_bytes(cur) + def revision_metadata_get_from_list(self, ids, cur=None): + yield from self._get_from_list( + 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) indexer_configuration_cols = ['id', 'tool_name', 'tool_version', 'tool_configuration'] diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -930,7 +930,7 @@ # given tool_id = self.tools['swh-metadata-translator']['id'] - metadatas = [ + metadata = [ { 'id': self.sha1_2, 'indexer_configuration_id': tool_id, @@ -942,7 +942,7 @@ ] # when - actual_missing = list(self.storage.content_metadata_missing(metadatas)) + actual_missing = list(self.storage.content_metadata_missing(metadata)) # then self.assertEqual(list(actual_missing), [ @@ -967,7 +967,7 @@ }]) # when - actual_missing = list(self.storage.content_metadata_missing(metadatas)) + actual_missing = list(self.storage.content_metadata_missing(metadata)) # then self.assertEqual(actual_missing, [self.sha1_1]) @@ -996,10 +996,10 @@ self.storage.content_metadata_add([metadata1]) # then - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2, self.sha1_1])) - expected_metadatas = [{ + expected_metadata = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, @@ -1014,7 +1014,7 @@ 'tool': self.tools['swh-metadata-translator'] }] - self.assertEqual(actual_metadatas, expected_metadatas) + self.assertEqual(actual_metadata, expected_metadata) @istest def content_metadata_add_drop_duplicate(self): @@ -1035,10 +1035,10 @@ self.storage.content_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, @@ -1048,7 +1048,7 @@ 'tool': self.tools['swh-metadata-translator'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() @@ -1063,11 +1063,11 @@ self.storage.content_metadata_add([metadata_v2]) # then - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # metadata did not change as the v2 was dropped. 
- self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) @istest def content_metadata_add_update_in_place_duplicate(self): @@ -1088,11 +1088,11 @@ self.storage.content_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # then - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, @@ -1101,7 +1101,7 @@ }, 'tool': self.tools['swh-metadata-translator'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() @@ -1114,11 +1114,11 @@ }) self.storage.content_metadata_add([metadata_v2], conflict_update=True) - actual_metadatas = list(self.storage.content_metadata_get( + actual_metadata = list(self.storage.content_metadata_get( [self.sha1_2])) # language did not change as the v2 was dropped. - expected_metadatas_v2 = [{ + expected_metadata_v2 = [{ 'id': self.sha1_2, 'translated_metadata': { 'other': {}, @@ -1129,14 +1129,14 @@ }] # metadata did change as the v2 was used to overwrite v1 - self.assertEqual(actual_metadatas, expected_metadatas_v2) + self.assertEqual(actual_metadata, expected_metadata_v2) @istest def revision_metadata_missing(self): # given tool_id = self.tools['swh-metadata-detector']['id'] - metadatas = [ + metadata = [ { 'id': self.revision_id_1, 'indexer_configuration_id': tool_id, @@ -1149,7 +1149,7 @@ # when actual_missing = list(self.storage.revision_metadata_missing( - metadatas)) + metadata)) # then self.assertEqual(list(actual_missing), [ @@ -1183,7 +1183,7 @@ # when actual_missing = list(self.storage.revision_metadata_missing( - metadatas)) + metadata)) # then self.assertEqual(actual_missing, [self.revision_id_2]) @@ -1220,16 +1220,16 @@ self.storage.revision_metadata_add([metadata_rev]) # then - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2, self.revision_id_1])) - expected_metadatas = [{ + expected_metadata = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_rev['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] - self.assertEqual(actual_metadatas, expected_metadatas) + self.assertEqual(actual_metadata, expected_metadata) @istest def revision_metadata_add_drop_duplicate(self): @@ -1263,16 +1263,16 @@ self.storage.revision_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_1])) - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.revision_id_1, 'translated_metadata': metadata_v1['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() @@ -1286,11 +1286,11 @@ self.storage.revision_metadata_add([metadata_v2]) # then - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_1])) # metadata did not change as the v2 was dropped. 
- self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) @istest def revision_metadata_add_update_in_place_duplicate(self): @@ -1324,16 +1324,16 @@ self.storage.revision_metadata_add([metadata_v1]) # when - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) # then - expected_metadatas_v1 = [{ + expected_metadata_v1 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v1['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] - self.assertEqual(actual_metadatas, expected_metadatas_v1) + self.assertEqual(actual_metadata, expected_metadata_v1) # given metadata_v2 = metadata_v1.copy() @@ -1345,18 +1345,18 @@ }) self.storage.revision_metadata_add([metadata_v2], conflict_update=True) - actual_metadatas = list(self.storage.revision_metadata_get( + actual_metadata = list(self.storage.revision_metadata_get( [self.revision_id_2])) # language did not change as the v2 was dropped. - expected_metadatas_v2 = [{ + expected_metadata_v2 = [{ 'id': self.revision_id_2, 'translated_metadata': metadata_v2['translated_metadata'], 'tool': self.tools['swh-metadata-detector'] }] # metadata did change as the v2 was used to overwrite v1 - self.assertEqual(actual_metadatas, expected_metadatas_v2) + self.assertEqual(actual_metadata, expected_metadata_v2) @istest def indexer_configuration_add(self):