D343.id1095.diff
No OneTemporary
Actions

Size

42 KB

Subscribers

None

D343.id1095.diff
View Options

	diff --git a/sql/swh-func.sql b/sql/swh-func.sql
	--- a/sql/swh-func.sql
	+++ b/sql/swh-func.sql
	@@ -38,39 +38,6 @@
	end
	$$;

	--- create a temporary table for content_ctags tmp_content_mimetype_missing,
	-create or replace function swh_mktemp_content_mimetype_missing()
	- returns void
	- language sql
	-as $$
	- create temporary table tmp_content_mimetype_missing (
	- id sha1,
	- indexer_configuration_id bigint
	- ) on commit drop;
	-$$;
	-
	-comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information';
	-
	--- check which entries of tmp_bytea are missing from content_mimetype
	---
	--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_content_mimetype_missing()
	- returns setof sha1
	- language plpgsql
	-as $$
	-begin
	- return query
	- (select id::sha1 from tmp_content_mimetype_missing as tmp
	- where not exists
	- (select 1 from content_mimetype as c
	- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id));
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information';
	-
	-- create a temporary table for content_mimetype tmp_content_mimetype,
	create or replace function swh_mktemp_content_mimetype()
	returns void
	@@ -118,70 +85,6 @@

	comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';

	-create type content_mimetype_signature as(
	- id sha1,
	- mimetype bytea,
	- encoding bytea,
	- tool_id integer,
	- tool_name text,
	- tool_version text,
	- tool_configuration jsonb
	-);
	-
	--- Retrieve list of content mimetype from the temporary table.
	---
	--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_content_mimetype_get()
	- returns setof content_mimetype_signature
	- language plpgsql
	-as $$
	-begin
	- return query
	- select c.id, mimetype, encoding,
	- i.id as tool_id, tool_name, tool_version, tool_configuration
	- from tmp_bytea t
	- inner join content_mimetype c on c.id=t.id
	- inner join indexer_configuration i on c.indexer_configuration_id=i.id;
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_mimetype_get() IS 'List content''s mimetypes';
	-
	--- create a temporary table for content_language tmp_content_language,
	-create or replace function swh_mktemp_content_language_missing()
	- returns void
	- language sql
	-as $$
	- create temporary table tmp_content_language_missing (
	- id sha1,
	- indexer_configuration_id integer
	- ) on commit drop;
	-$$;
	-
	-comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language';
	-
	--- check which entries of tmp_bytea are missing from content_language
	---
	--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_content_language_missing()
	- returns setof sha1
	- language plpgsql
	-as $$
	-begin
	- return query
	- select id::sha1 from tmp_content_language_missing as tmp
	- where not exists
	- (select 1 from content_language as c
	- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_language_missing() IS 'Filter missing content languages';
	-
	-- add tmp_content_language entries to content_language, overwriting
	-- duplicates if conflict_update is true, skipping duplicates otherwise.
	--
	@@ -228,34 +131,6 @@

	comment on function swh_mktemp_content_language() is 'Helper table to add content language';

	-create type content_language_signature as (
	- id sha1,
	- lang languages,
	- tool_id integer,
	- tool_name text,
	- tool_version text,
	- tool_configuration jsonb
	-);
	-
	--- Retrieve list of content language from the temporary table.
	---
	--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
	-create or replace function swh_content_language_get()
	- returns setof content_language_signature
	- language plpgsql
	-as $$
	-begin
	- return query
	- select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration
	- from tmp_bytea t
	- inner join content_language c on c.id = t.id
	- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_language_get() is 'List content''s language';
	-

	-- create a temporary table for content_ctags tmp_content_ctags,
	create or replace function swh_mktemp_content_ctags()
	@@ -298,40 +173,6 @@

	comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';

	--- create a temporary table for content_ctags missing routine
	-create or replace function swh_mktemp_content_ctags_missing()
	- returns void
	- language sql
	-as $$
	- create temporary table tmp_content_ctags_missing (
	- id sha1,
	- indexer_configuration_id integer
	- ) on commit drop;
	-$$;
	-
	-comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags';
	-
	--- check which entries of tmp_bytea are missing from content_ctags
	---
	--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_content_ctags_missing()
	- returns setof sha1
	- language plpgsql
	-as $$
	-begin
	- return query
	- (select id::sha1 from tmp_content_ctags_missing as tmp
	- where not exists
	- (select 1 from content_ctags as c
	- where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id
	- limit 1));
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_ctags_missing() IS 'Filter missing content ctags';
	-
	create type content_ctags_signature as (
	id sha1,
	name text,
	@@ -344,27 +185,6 @@
	tool_configuration jsonb
	);

	--- Retrieve list of content ctags from the temporary table.
	---
	--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
	-create or replace function swh_content_ctags_get()
	- returns setof content_ctags_signature
	- language plpgsql
	-as $$
	-begin
	- return query
	- select c.id, c.name, c.kind, c.line, c.lang,
	- i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration
	- from tmp_bytea t
	- inner join content_ctags c using(id)
	- inner join indexer_configuration i on i.id = c.indexer_configuration_id
	- order by line;
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_ctags_get() IS 'List content ctags';
	-
	-- Search within ctags content.
	--
	create or replace function swh_content_ctags_search(
	@@ -440,77 +260,7 @@

	comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';

	-create type content_fossology_license_signature as (
	- id sha1,
	- tool_id integer,
	- tool_name text,
	- tool_version text,
	- tool_configuration jsonb,
	- licenses text[]
	-);
	-
	--- Retrieve list of content license from the temporary table.
	---
	--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_content_fossology_license_get()
	- returns setof content_fossology_license_signature
	- language plpgsql
	-as $$
	-begin
	- return query
	- select cl.id,
	- ic.id as tool_id,
	- ic.tool_name,
	- ic.tool_version,
	- ic.tool_configuration,
	- array(select name
	- from fossology_license
	- where id = ANY(array_agg(cl.license_id))) as licenses
	- from tmp_bytea tcl
	- inner join content_fossology_license cl using(id)
	- inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id
	- group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration;
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_fossology_license_get() IS 'List content licenses';
	-
	-- content_metadata functions
	---
	--- create a temporary table for content_metadata tmp_content_metadata,
	-create or replace function swh_mktemp_content_metadata_missing()
	- returns void
	- language sql
	-as $$
	- create temporary table tmp_content_metadata_missing (
	- id sha1,
	- indexer_configuration_id integer
	- ) on commit drop;
	-$$;
	-
	-comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata';
	-
	--- check which entries of tmp_bytea are missing from content_metadata
	---
	--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_content_metadata_missing()
	- returns setof sha1
	- language plpgsql
	-as $$
	-begin
	- return query
	- select id::sha1 from tmp_content_metadata_missing as tmp
	- where not exists
	- (select 1 from content_metadata as c
	- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_metadata_missing() IS 'Filter missing content metadata';

	-- add tmp_content_metadata entries to content_metadata, overwriting
	-- duplicates if conflict_update is true, skipping duplicates otherwise.
	@@ -558,71 +308,8 @@

	comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata';

	---
	-create type content_metadata_signature as (
	- id sha1,
	- translated_metadata jsonb,
	- tool_id integer,
	- tool_name text,
	- tool_version text,
	- tool_configuration jsonb
	-);
	-
	--- Retrieve list of content metadata from the temporary table.
	---
	--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
	-create or replace function swh_content_metadata_get()
	- returns setof content_metadata_signature
	- language plpgsql
	-as $$
	-begin
	- return query
	- select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
	- from tmp_bytea t
	- inner join content_metadata c on c.id = t.id
	- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
	- return;
	-end
	-$$;
	-
	-comment on function swh_content_metadata_get() is 'List content''s metadata';
	-- end content_metadata functions

	--- revision_metadata functions
	---
	--- create a temporary table for revision_metadata tmp_revision_metadata,
	-create or replace function swh_mktemp_revision_metadata_missing()
	- returns void
	- language sql
	-as $$
	- create temporary table tmp_revision_metadata_missing (
	- id sha1_git,
	- indexer_configuration_id integer
	- ) on commit drop;
	-$$;
	-
	-comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata';
	-
	--- check which entries of tmp_bytea are missing from revision_metadata
	---
	--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
	--- 2. call this function
	-create or replace function swh_revision_metadata_missing()
	- returns setof sha1
	- language plpgsql
	-as $$
	-begin
	- return query
	- select id::sha1 from tmp_revision_metadata_missing as tmp
	- where not exists
	- (select 1 from revision_metadata as c
	- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
	- return;
	-end
	-$$;
	-
	-comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata';
	-
	-- add tmp_revision_metadata entries to revision_metadata, overwriting
	-- duplicates if conflict_update is true, skipping duplicates otherwise.
	--
	@@ -669,33 +356,6 @@

	comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata';

	---
	-create type revision_metadata_signature as (
	- id sha1_git,
	- translated_metadata jsonb,
	- tool_id integer,
	- tool_name text,
	- tool_version text,
	- tool_configuration jsonb
	-);
	-
	--- Retrieve list of revision metadata from the temporary table.
	---
	--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
	-create or replace function swh_revision_metadata_get()
	- returns setof revision_metadata_signature
	- language plpgsql
	-as $$
	-begin
	- return query
	- select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
	- from tmp_bytea t
	- inner join revision_metadata c on c.id = t.id
	- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
	- return;
	-end
	-$$;
	-
	create or replace function swh_mktemp_indexer_configuration()
	returns void
	language sql
	diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
	--- a/sql/swh-schema.sql
	+++ b/sql/swh-schema.sql
	@@ -14,7 +14,7 @@
	);

	insert into dbversion(version, release, description)
	- values(114, now(), 'Work In Progress');
	+ values(115, now(), 'Work In Progress');
	-- Computing metadata on sha1's contents

	-- a SHA1 checksum (not necessarily originating from Git)
	diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql
	new file mode 100644
	--- /dev/null
	+++ b/sql/upgrades/115.sql
	@@ -0,0 +1,40 @@
	+-- SWH Indexer DB schema upgrade
	+-- from_version: 114
	+-- to_version: 115
	+-- description: Remove temporary table use in reading api
	+
	+insert into dbversion(version, release, description)
	+values(115, now(), 'Work In Progress');
	+
	+drop function swh_mktemp_content_mimetype_missing();
	+drop function swh_content_mimetype_missing();
	+
	+drop function swh_content_mimetype_get();
	+drop type content_mimetype_signature;
	+
	+drop function swh_mktemp_content_language_missing();
	+drop function swh_content_language_missing();
	+
	+drop function swh_content_language_get();
	+drop type content_language_signature;
	+
	+drop function swh_mktemp_content_ctags_missing();
	+drop function swh_content_ctags_missing();
	+
	+drop function swh_content_ctags_get();
	+--drop type content_ctags_signature; -- still used in swh_content_ctags_search
	+
	+drop function swh_content_fossology_license_get();
	+drop type content_fossology_license_signature;
	+
	+drop function swh_mktemp_content_metadata_missing();
	+drop function swh_content_metadata_missing();
	+
	+drop function swh_content_metadata_get();
	+drop type content_metadata_signature;
	+
	+drop function swh_mktemp_revision_metadata_missing();
	+drop function swh_revision_metadata_missing();
	+
	+drop function swh_revision_metadata_get();
	+drop type revision_metadata_signature;
	diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
	--- a/swh/indexer/storage/__init__.py
	+++ b/swh/indexer/storage/__init__.py
	@@ -95,20 +95,16 @@
	Args:
	mimetypes (iterable): iterable of dict with keys:

	- - id (bytes): sha1 identifier
	- - tool_name (str): tool used to compute the results
	- - tool_version (str): associated tool's version
	+ id (bytes): sha1 identifier
	+ indexer_configuration_id (int): tool used to compute
	+ the results

	- Returns:
	- iterable: an iterable of missing id for the triplets id, tool_name,
	- tool_version
	+ Yields:
	+ the id of each missing tuple (id,
	+ indexer_configuration_id)

	"""
	- db.mktemp_content_mimetype_missing(cur)
	- db.copy_to(mimetypes, 'tmp_content_mimetype_missing',
	- ['id', 'indexer_configuration_id'],
	- cur)
	- for obj in db.content_mimetype_missing_from_temp(cur):
	+ for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
	yield obj[0]

	@db_transaction()
	@@ -119,13 +115,14 @@
	Args:
	mimetypes (iterable): dictionaries with keys:

	- - id (bytes): sha1 identifier
	- - mimetype (bytes): raw content's mimetype
	- - encoding (bytes): raw content's encoding
	- - indexer_configuration_id (int): tool's id used to
	- compute the results
	- - conflict_update: Flag to determine if we want to
	- overwrite (true) or skip duplicates (false, the default)
	+ id (bytes): sha1 identifier
	+ mimetype (bytes): raw content's mimetype
	+ encoding (bytes): raw content's encoding
	+ indexer_configuration_id (int): tool's id used to
	+ compute the results
	+ conflict_update (bool): Flag to determine if we want to
	+ overwrite (true) or skip duplicates
	+ (false, the default)

	"""
	db.mktemp_content_mimetype(cur)
	@@ -136,8 +133,21 @@

	@db_transaction_generator()
	def content_mimetype_get(self, ids, db=None, cur=None):
	- db.store_tmp_bytea(ids, cur)
	- for c in db.content_mimetype_get_from_temp():
	+ """Retrieve full content mimetype per ids.
	+
	+ Args:
	+ ids (iterable): sha1 identifier
	+
	+ Yields:
	+ mimetypes (iterable): dictionaries with keys:
	+
	+ id (bytes): sha1 identifier
	+ mimetype (bytes): raw content's mimetype
	+ encoding (bytes): raw content's encoding
	+ tool (dict): Tool used to compute the mimetype
	+
	+ """
	+ for c in db.content_mimetype_get_from_list(ids, cur):
	yield converters.db_to_mimetype(
	dict(zip(db.content_mimetype_cols, c)))

	@@ -148,24 +158,34 @@
	Args:
	languages (iterable): dictionaries with keys:

	- - id (bytes): sha1 identifier
	- - tool_name (str): tool used to compute the results
	- - tool_version (str): associated tool's version
	+ id (bytes): sha1 identifier
	+ indexer_configuration_id (int): tool used to compute
	+ the results

	- Returns:
	- iterable: identifiers of missing languages
	+ Yields:
	+ the id of each missing tuple (id,
	+ indexer_configuration_id)

	"""
	- db.mktemp_content_language_missing(cur)
	- db.copy_to(languages, 'tmp_content_language_missing',
	- ['id', 'indexer_configuration_id'], cur)
	- for obj in db.content_language_missing_from_temp(cur):
	+ for obj in db.content_language_missing_from_list(languages, cur):
	yield obj[0]

	@db_transaction_generator()
	def content_language_get(self, ids, db=None, cur=None):
	- db.store_tmp_bytea(ids, cur)
	- for c in db.content_language_get_from_temp():
	+ """Retrieve full content language per ids.
	+
	+ Args:
	+ ids (iterable): sha1 identifier
	+
	+ Yields:
	+ languages (iterable): dictionaries with keys:
	+
	+ id (bytes): sha1 identifier
	+ lang (bytes): raw content's language
	+ tool (dict): Tool used to compute the language
	+
	+ """
	+ for c in db.content_language_get_from_list(ids, cur):
	yield converters.db_to_language(
	dict(zip(db.content_language_cols, c)))

	@@ -177,11 +197,12 @@
	Args:
	languages (iterable): dictionaries with keys:

	- - id: sha1
	- - lang: bytes
	+ id (bytes): sha1
	+ lang (bytes): language detected

	- conflict_update: Flag to determine if we want to overwrite (true)
	- or skip duplicates (false, the default)
	+ conflict_update (bool): Flag to determine if we want to
	+ overwrite (true) or skip duplicates (false, the
	+ default)

	"""
	db.mktemp_content_language(cur)
	@@ -204,20 +225,16 @@
	Args:
	ctags (iterable): dicts with keys:

	- - id (bytes): sha1 identifier
	- - tool_name (str): tool name used
	- - tool_version (str): associated version
	+ id (bytes): sha1 identifier
	+ indexer_configuration_id (int): tool used to compute
	+ the results

	- Returns:
	- an iterable of missing id
	+ Yields:
	+ the id of each missing tuple (id,
	+ indexer_configuration_id)

	"""
	- db.mktemp_content_ctags_missing(cur)
	- db.copy_to(ctags,
	- tblname='tmp_content_ctags_missing',
	- columns=['id', 'indexer_configuration_id'],
	- cur=cur)
	- for obj in db.content_ctags_missing_from_temp(cur):
	+ for obj in db.content_ctags_missing_from_list(ctags, cur):
	yield obj[0]

	@db_transaction_generator()
	@@ -227,9 +244,18 @@
	Args:
	ids (iterable): sha1 checksums

	+ Yields:
	+ Dictionaries with keys:
	+
	+ id (bytes): content's identifier
	+ name (str): symbol's name
	+ kind (str): symbol's kind
	+ language (str): language for that content
	+ tool (dict): tool used to compute the ctags' info
	+
	+
	"""
	- db.store_tmp_bytea(ids, cur)
	- for c in db.content_ctags_get_from_temp():
	+ for c in db.content_ctags_get_from_list(ids, cur):
	yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))

	@db_transaction()
	@@ -240,9 +266,9 @@
	Args:
	ctags (iterable): dictionaries with keys:

	- - id (bytes): sha1
	- - ctags ([list): List of dictionary with keys: name, kind,
	- line, language
	+ id (bytes): sha1
	+ ctags (list): List of dictionaries with keys: name, kind,
	+ line, language

	"""
	def _convert_ctags(__ctags):
	@@ -289,13 +315,12 @@
	Yields:
	list: dictionaries with the following keys:

	- - id (bytes)
	- - licenses ([str]): associated licenses for that content
	+ id (bytes)
	+ licenses ([str]): associated licenses for that content
	+ tool (dict): Tool used to compute the license

	"""
	- db.store_tmp_bytea(ids, cur)
	-
	- for c in db.content_fossology_license_get_from_temp():
	+ for c in db.content_fossology_license_get_from_list(ids, cur):
	license = dict(zip(db.content_fossology_license_cols, c))
	yield converters.db_to_fossology_license(license)

	@@ -339,24 +364,34 @@
	Args:
	metadatas (iterable): dictionaries with keys:

	- - id (bytes): sha1 identifier
	- - tool_name (str): tool used to compute the results
	- - tool_version (str): associated tool's version
	+ id (bytes): sha1 identifier
	+ indexer_configuration_id (int): tool used to compute
	+ the results

	- Returns:
	- iterable: missing ids
	+ Yields:
	+ the id of each missing tuple (id,
	+ indexer_configuration_id)

	"""
	- db.mktemp_content_metadata_missing(cur)
	- db.copy_to(metadatas, 'tmp_content_metadata_missing',
	- ['id', 'indexer_configuration_id'], cur)
	- for obj in db.content_metadata_missing_from_temp(cur):
	+ for obj in db.content_metadata_missing_from_list(metadatas, cur):
	yield obj[0]

	@db_transaction_generator()
	def content_metadata_get(self, ids, db=None, cur=None):
	- db.store_tmp_bytea(ids, cur)
	- for c in db.content_metadata_get_from_temp():
	+ """Retrieve metadata per id.
	+
	+ Args:
	+ ids (iterable): sha1 checksums
	+
	+ Yields:
	+ list: dictionaries with the following keys:
	+
	+ id (bytes)
	+ translated_metadata (str): associated metadata
	+ tool (dict): tool used to compute metadata
	+
	+ """
	+ for c in db.content_metadata_get_from_list(ids, cur):
	yield converters.db_to_metadata(
	dict(zip(db.content_metadata_cols, c)))

	@@ -368,8 +403,8 @@
	Args:
	metadatas (iterable): dictionaries with keys:

	- - id: sha1
	- - translated_metadata: bytes / jsonb ?
	+ id: sha1
	+ translated_metadata: bytes / jsonb ?

	conflict_update: Flag to determine if we want to overwrite (true)
	or skip duplicates (false, the default)
	@@ -390,24 +425,33 @@
	Args:
	metadatas (iterable): dictionaries with keys:

	- - id (bytes): sha1_git revision identifier
	- - tool_name (str): tool used to compute the results
	- - tool_version (str): associated tool's version
	+ id (bytes): sha1_git revision identifier
	+ indexer_configuration_id (int): tool used to compute
	+ the results

	Returns:
	iterable: missing ids

	"""
	- db.mktemp_revision_metadata_missing(cur)
	- db.copy_to(metadatas, 'tmp_revision_metadata_missing',
	- ['id', 'indexer_configuration_id'], cur)
	- for obj in db.revision_metadata_missing_from_temp(cur):
	+ for obj in db.revision_metadata_missing_from_list(metadatas, cur):
	yield obj[0]

	@db_transaction_generator()
	def revision_metadata_get(self, ids, db=None, cur=None):
	- db.store_tmp_bytea(ids, cur)
	- for c in db.revision_metadata_get_from_temp():
	+ """Retrieve revision metadata per id.
	+
	+ Args:
	+ ids (iterable): sha1_git revision identifiers
	+
	+ Yields:
	+ list: dictionaries with the following keys:
	+
	+ id (bytes)
	+ translated_metadata (str): associated metadata
	+ tool (dict): tool used to compute metadata
	+
	+ """
	+ for c in db.revision_metadata_get_from_list(ids, cur):
	yield converters.db_to_metadata(
	dict(zip(db.revision_metadata_cols, c)))

	@@ -434,54 +478,6 @@
	cur)
	db.revision_metadata_add_from_temp(conflict_update, cur)

	- @db_transaction()
	- def origin_metadata_add(self, origin_id, ts, provider, tool, metadata,
	- db=None, cur=None):
	- """ Add an origin_metadata for the origin at ts with provenance and
	- metadata.
	-
	- Args:
	- origin_id (int): the origin's id for which the metadata is added
	- ts (datetime): timestamp of the found metadata
	- provider (int): the provider of metadata (ex:'hal')
	- tool (int): tool used to extract metadata
	- metadata (jsonb): the metadata retrieved at the time and location
	-
	- Returns:
	- id (int): the origin_metadata unique id
	- """
	- if isinstance(ts, str):
	- ts = dateutil.parser.parse(ts)
	-
	- return db.origin_metadata_add(origin_id, ts, provider, tool,
	- metadata, cur)
	-
	- @db_transaction_generator()
	- def origin_metadata_get_by(self, origin_id, provider_type=None, db=None,
	- cur=None):
	- """Retrieve list of all origin_metadata entries for the origin_id
	-
	- Args:
	- origin_id (int): the unique origin identifier
	- provider_type (str): (optional) type of provider
	-
	- Returns:
	- list of dicts: the origin_metadata dictionary with the keys:
	-
	- - id (int): origin_metadata's id
	- - origin_id (int): origin's id
	- - discovery_date (datetime): timestamp of discovery
	- - tool_id (int): metadata's extracting tool
	- - metadata (jsonb)
	- - provider_id (int): metadata's provider
	- - provider_name (str)
	- - provider_type (str)
	- - provider_url (str)
	-
	- """
	- for line in db.origin_metadata_get_by(origin_id, provider_type, cur):
	- yield dict(zip(db.origin_metadata_get_cols, line))
	-
	@db_transaction_generator()
	def indexer_configuration_add(self, tools, db=None, cur=None):
	"""Add new tools to the storage.
	diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
	--- a/swh/indexer/storage/db.py
	+++ b/swh/indexer/storage/db.py
	@@ -1,4 +1,4 @@
	-# Copyright (C) 2015-2017 The Software Heritage developers
	+# Copyright (C) 2015-2018 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information
	@@ -6,7 +6,7 @@
	from swh.model import hashutil

	from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes
	-from swh.storage.db import line_to_bytes
	+from swh.storage.db import line_to_bytes, execute_values_to_bytes


	class Db(BaseDb):
	@@ -24,20 +24,33 @@
	self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea',
	['id'], cur)

	- content_mimetype_cols = [
	- 'id', 'mimetype', 'encoding',
	- 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
	-
	- @stored_procedure('swh_mktemp_content_mimetype_missing')
	- def mktemp_content_mimetype_missing(self, cur=None): pass
	+ content_mimetype_hash_keys = ['id', 'indexer_configuration_id']

	- def content_mimetype_missing_from_temp(self, cur=None):
	+ def content_mimetype_missing_from_list(self, mimetypes, cur=None):
	"""List missing mimetypes.

	"""
	cur = self._cursor(cur)
	- cur.execute("SELECT * FROM swh_content_mimetype_missing()")
	- yield from cursor_to_bytes(cur)
	+ keys = ', '.join(self.content_mimetype_hash_keys)
	+ equality = ' AND '.join(
	+ ('t.%s = c.%s' % (key, key))
	+ for key in self.content_mimetype_hash_keys
	+ )
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s from (values %%s) as t(%s)
	+ where not exists (
	+ select 1 from content_mimetype c
	+ where %s
	+ )
	+ """ % (keys, keys, equality),
	+ (tuple(m[k] for k in self.content_mimetype_hash_keys)
	+ for m in mimetypes)
	+ )
	+
	+ content_mimetype_cols = [
	+ 'id', 'mimetype', 'encoding',
	+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']

	@stored_procedure('swh_mktemp_content_mimetype')
	def mktemp_content_mimetype(self, cur=None): pass
	@@ -46,12 +59,70 @@
	self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)",
	(conflict_update, ))

	- def content_mimetype_get_from_temp(self, cur=None):
	+ def _convert_key(self, key, main_table='c'):
	+ """Convert keys according to specific use in the module.
	+ Args:
	+ key (str): Key expression to change according to the alias
	+ used in the query
	+ main_table (str): Alias to use for the main table. Default
	+ to c for content_{something}.
	+
	+ Expected:
	+ Tables content_{something} being aliased as 'c' (something
	+ in {language, mimetype, ...}), table indexer_configuration
	+ being aliased as 'i'.
	+
	+ """
	+ if key == 'id':
	+ return '%s.id' % main_table
	+ elif key == 'tool_id':
	+ return 'i.id as tool_id'
	+ elif key == 'licenses':
	+ return '''
	+ array(select name
	+ from fossology_license
	+ where id = ANY(
	+ array_agg(%s.license_id))) as licenses''' % main_table
	+ return key
	+
	+ def content_mimetype_get_from_list(self, ids, cur=None):
	cur = self._cursor(cur)
	- query = "SELECT %s FROM swh_content_mimetype_get()" % (
	- ','.join(self.content_mimetype_cols))
	- cur.execute(query)
	- yield from cursor_to_bytes(cur)
	+ keys = map(self._convert_key, self.content_mimetype_cols)
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s
	+ from (values %%s) as t(id)
	+ inner join content_mimetype c
	+ on c.id=t.id
	+ inner join indexer_configuration i
	+ on c.indexer_configuration_id=i.id;
	+ """ % ', '.join(keys),
	+ ((_id,) for _id in ids)
	+ )
	+
	+ content_language_hash_keys = ['id', 'indexer_configuration_id']
	+
	+ def content_language_missing_from_list(self, languages, cur=None):
	+ """List missing languages.
	+
	+ """
	+ cur = self._cursor(cur)
	+ keys = ', '.join(self.content_language_hash_keys)
	+ equality = ' AND '.join(
	+ ('t.%s = c.%s' % (key, key))
	+ for key in self.content_language_hash_keys
	+ )
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s from (values %%s) as t(%s)
	+ where not exists (
	+ select 1 from content_language c
	+ where %s
	+ )
	+ """ % (keys, keys, equality),
	+ (tuple(l[k] for k in self.content_language_hash_keys)
	+ for l in languages)
	+ )

	content_language_cols = [
	'id', 'lang',
	@@ -60,27 +131,48 @@
	@stored_procedure('swh_mktemp_content_language')
	def mktemp_content_language(self, cur=None): pass

	- @stored_procedure('swh_mktemp_content_language_missing')
	- def mktemp_content_language_missing(self, cur=None): pass
	-
	- def content_language_missing_from_temp(self, cur=None):
	- """List missing languages.
	-
	- """
	- cur = self._cursor(cur)
	- cur.execute("SELECT * FROM swh_content_language_missing()")
	- yield from cursor_to_bytes(cur)
	-
	def content_language_add_from_temp(self, conflict_update, cur=None):
	self._cursor(cur).execute("SELECT swh_content_language_add(%s)",
	(conflict_update, ))

	- def content_language_get_from_temp(self, cur=None):
	+ def content_language_get_from_list(self, ids, cur=None):
	cur = self._cursor(cur)
	- query = "SELECT %s FROM swh_content_language_get()" % (
	- ','.join(self.content_language_cols))
	- cur.execute(query)
	- yield from cursor_to_bytes(cur)
	+ keys = map(self._convert_key, self.content_language_cols)
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s
	+ from (values %%s) as t(id)
	+ inner join content_language c
	+ on c.id=t.id
	+ inner join indexer_configuration i
	+ on c.indexer_configuration_id=i.id;
	+ """ % ', '.join(keys),
	+ ((_id,) for _id in ids)
	+ )
	+
	+ content_ctags_hash_keys = ['id', 'indexer_configuration_id']
	+
	+ def content_ctags_missing_from_list(self, ctags, cur=None):
	+ """List missing ctags.
	+
	+ """
	+ cur = self._cursor(cur)
	+ keys = ', '.join(self.content_ctags_hash_keys)
	+ equality = ' AND '.join(
	+ ('t.%s = c.%s' % (key, key))
	+ for key in self.content_ctags_hash_keys
	+ )
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s from (values %%s) as t(%s)
	+ where not exists (
	+ select 1 from content_ctags c
	+ where %s
	+ )
	+ """ % (keys, keys, equality),
	+ (tuple(c[k] for k in self.content_ctags_hash_keys)
	+ for c in ctags)
	+ )

	content_ctags_cols = [
	'id', 'name', 'kind', 'line', 'lang',
	@@ -89,27 +181,24 @@
	@stored_procedure('swh_mktemp_content_ctags')
	def mktemp_content_ctags(self, cur=None): pass

	- @stored_procedure('swh_mktemp_content_ctags_missing')
	- def mktemp_content_ctags_missing(self, cur=None): pass
	-
	- def content_ctags_missing_from_temp(self, cur=None):
	- """List missing ctags.
	-
	- """
	- cur = self._cursor(cur)
	- cur.execute("SELECT * FROM swh_content_ctags_missing()")
	- yield from cursor_to_bytes(cur)
	-
	def content_ctags_add_from_temp(self, conflict_update, cur=None):
	self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)",
	(conflict_update, ))

	- def content_ctags_get_from_temp(self, cur=None):
	+ def content_ctags_get_from_list(self, ids, cur=None):
	cur = self._cursor(cur)
	- query = "SELECT %s FROM swh_content_ctags_get()" % (
	- ','.join(self.content_ctags_cols))
	- cur.execute(query)
	- yield from cursor_to_bytes(cur)
	+ keys = map(self._convert_key, self.content_ctags_cols)
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s
	+ from (values %%s) as t(id)
	+ inner join content_ctags c
	+ on c.id=t.id
	+ inner join indexer_configuration i
	+ on c.indexer_configuration_id=i.id;
	+ """ % ', '.join(keys),
	+ ((_id,) for _id in ids)
	+ )

	def content_ctags_search(self, expression, last_sha1, limit, cur=None):
	cur = self._cursor(cur)
	@@ -147,15 +236,48 @@
	"SELECT swh_content_fossology_license_add(%s)",
	(conflict_update, ))

	- def content_fossology_license_get_from_temp(self, cur=None):
	- """Retrieve licenses per content.
	+ def content_fossology_license_get_from_list(self, ids, cur=None):
	+ """Retrieve licenses per id.

	"""
	cur = self._cursor(cur)
	- query = "SELECT %s FROM swh_content_fossology_license_get()" % (
	- ','.join(self.content_fossology_license_cols))
	- cur.execute(query)
	- yield from cursor_to_bytes(cur)
	+ keys = map(self._convert_key, self.content_fossology_license_cols)
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s
	+ from (values %%s) as t(id)
	+ inner join content_fossology_license c on t.id=c.id
	+ inner join indexer_configuration i
	+ on i.id=c.indexer_configuration_id
	+ group by c.id, i.id, i.tool_name, i.tool_version,
	+ i.tool_configuration;
	+ """ % ', '.join(keys),
	+ ((_id,) for _id in ids)
	+ )
	+
	+ content_metadata_hash_keys = ['id', 'indexer_configuration_id']
	+
	+ def content_metadata_missing_from_list(self, metadata, cur=None):
	+ """List missing metadata.
	+
	+ """
	+ cur = self._cursor(cur)
	+ keys = ', '.join(self.content_metadata_hash_keys)
	+ equality = ' AND '.join(
	+ ('t.%s = c.%s' % (key, key))
	+ for key in self.content_metadata_hash_keys
	+ )
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s from (values %%s) as t(%s)
	+ where not exists (
	+ select 1 from content_metadata c
	+ where %s
	+ )
	+ """ % (keys, keys, equality),
	+ (tuple(m[k] for k in self.content_metadata_hash_keys)
	+ for m in metadata)
	+ )

	content_metadata_cols = [
	'id', 'translated_metadata',
	@@ -164,27 +286,48 @@
	@stored_procedure('swh_mktemp_content_metadata')
	def mktemp_content_metadata(self, cur=None): pass

	- @stored_procedure('swh_mktemp_content_metadata_missing')
	- def mktemp_content_metadata_missing(self, cur=None): pass
	-
	- def content_metadata_missing_from_temp(self, cur=None):
	- """List missing metadatas.
	-
	- """
	- cur = self._cursor(cur)
	- cur.execute("SELECT * FROM swh_content_metadata_missing()")
	- yield from cursor_to_bytes(cur)
	-
	def content_metadata_add_from_temp(self, conflict_update, cur=None):
	self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
	(conflict_update, ))

	- def content_metadata_get_from_temp(self, cur=None):
	+ def content_metadata_get_from_list(self, ids, cur=None):
	cur = self._cursor(cur)
	- query = "SELECT %s FROM swh_content_metadata_get()" % (
	- ','.join(self.content_metadata_cols))
	- cur.execute(query)
	- yield from cursor_to_bytes(cur)
	+ keys = map(self._convert_key, self.content_metadata_cols)
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s
	+ from (values %%s) as t(id)
	+ inner join content_metadata c
	+ on c.id=t.id
	+ inner join indexer_configuration i
	+ on c.indexer_configuration_id=i.id;
	+ """ % ', '.join(keys),
	+ ((_id,) for _id in ids)
	+ )
	+
	+ content_revision_metadata_hash_keys = ['id', 'indexer_configuration_id']
	+
	+ def revision_metadata_missing_from_list(self, metadata, cur=None):
	+ """List missing metadata.
	+
	+ """
	+ cur = self._cursor(cur)
	+ keys = ', '.join(self.content_revision_metadata_hash_keys)
	+ equality = ' AND '.join(
	+ ('t.%s = r.%s' % (key, key))
	+ for key in self.content_revision_metadata_hash_keys
	+ )
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s from (values %%s) as t(%s)
	+ where not exists (
	+ select 1 from revision_metadata r
	+ where %s
	+ )
	+ """ % (keys, keys, equality),
	+ (tuple(m[k] for k in self.content_revision_metadata_hash_keys)
	+ for m in metadata)
	+ )

	revision_metadata_cols = [
	'id', 'translated_metadata',
	@@ -193,27 +336,25 @@
	@stored_procedure('swh_mktemp_revision_metadata')
	def mktemp_revision_metadata(self, cur=None): pass

	- @stored_procedure('swh_mktemp_revision_metadata_missing')
	- def mktemp_revision_metadata_missing(self, cur=None): pass
	-
	- def revision_metadata_missing_from_temp(self, cur=None):
	- """List missing metadatas.
	-
	- """
	- cur = self._cursor(cur)
	- cur.execute("SELECT * FROM swh_revision_metadata_missing()")
	- yield from cursor_to_bytes(cur)
	-
	def revision_metadata_add_from_temp(self, conflict_update, cur=None):
	self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
	(conflict_update, ))

	- def revision_metadata_get_from_temp(self, cur=None):
	+ def revision_metadata_get_from_list(self, ids, cur=None):
	cur = self._cursor(cur)
	- query = "SELECT %s FROM swh_revision_metadata_get()" % (
	- ','.join(self.revision_metadata_cols))
	- cur.execute(query)
	- yield from cursor_to_bytes(cur)
	+ keys = map(lambda k: self._convert_key(k, main_table='r'),
	+ self.revision_metadata_cols)
	+ yield from execute_values_to_bytes(
	+ cur, """
	+ select %s
	+ from (values %%s) as t(id)
	+ inner join revision_metadata r
	+ on r.id=t.id
	+ inner join indexer_configuration i
	+ on r.indexer_configuration_id=i.id;
	+ """ % ', '.join(keys),
	+ ((_id,) for _id in ids)
	+ )

	indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
	'tool_configuration']

File Metadata

Mime Type: text/plain
Expires: Mar 17 2025, 6:30 PM (7 w, 3 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3231595

D343.id1095.diffNo OneTemporaryActions

D343.id1095.diffView Options

File Metadata

Event Timeline

D343.id1095.diff
No OneTemporary
Actions

D343.id1095.diff
View Options