Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7342982
D343.id1095.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
42 KB
Subscribers
None
D343.id1095.diff
View Options
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -38,39 +38,6 @@
end
$$;
--- create a temporary table for content_ctags tmp_content_mimetype_missing,
-create or replace function swh_mktemp_content_mimetype_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_mimetype_missing (
- id sha1,
- indexer_configuration_id bigint
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information';
-
--- check which entries of tmp_bytea are missing from content_mimetype
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_mimetype_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- (select id::sha1 from tmp_content_mimetype_missing as tmp
- where not exists
- (select 1 from content_mimetype as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id));
- return;
-end
-$$;
-
-comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information';
-
-- create a temporary table for content_mimetype tmp_content_mimetype,
create or replace function swh_mktemp_content_mimetype()
returns void
@@ -118,70 +85,6 @@
comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';
-create type content_mimetype_signature as(
- id sha1,
- mimetype bytea,
- encoding bytea,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of content mimetype from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_mimetype_get()
- returns setof content_mimetype_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, mimetype, encoding,
- i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join content_mimetype c on c.id=t.id
- inner join indexer_configuration i on c.indexer_configuration_id=i.id;
- return;
-end
-$$;
-
-comment on function swh_content_mimetype_get() IS 'List content''s mimetypes';
-
--- create a temporary table for content_language tmp_content_language,
-create or replace function swh_mktemp_content_language_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_language_missing (
- id sha1,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language';
-
--- check which entries of tmp_bytea are missing from content_language
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_language_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- select id::sha1 from tmp_content_language_missing as tmp
- where not exists
- (select 1 from content_language as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
- return;
-end
-$$;
-
-comment on function swh_content_language_missing() IS 'Filter missing content languages';
-
-- add tmp_content_language entries to content_language, overwriting
-- duplicates if conflict_update is true, skipping duplicates otherwise.
--
@@ -228,34 +131,6 @@
comment on function swh_mktemp_content_language() is 'Helper table to add content language';
-create type content_language_signature as (
- id sha1,
- lang languages,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of content language from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_content_language_get()
- returns setof content_language_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join content_language c on c.id = t.id
- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
- return;
-end
-$$;
-
-comment on function swh_content_language_get() is 'List content''s language';
-
-- create a temporary table for content_ctags tmp_content_ctags,
create or replace function swh_mktemp_content_ctags()
@@ -298,40 +173,6 @@
comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';
--- create a temporary table for content_ctags missing routine
-create or replace function swh_mktemp_content_ctags_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_ctags_missing (
- id sha1,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags';
-
--- check which entries of tmp_bytea are missing from content_ctags
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_ctags_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- (select id::sha1 from tmp_content_ctags_missing as tmp
- where not exists
- (select 1 from content_ctags as c
- where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id
- limit 1));
- return;
-end
-$$;
-
-comment on function swh_content_ctags_missing() IS 'Filter missing content ctags';
-
create type content_ctags_signature as (
id sha1,
name text,
@@ -344,27 +185,6 @@
tool_configuration jsonb
);
--- Retrieve list of content ctags from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_content_ctags_get()
- returns setof content_ctags_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, c.name, c.kind, c.line, c.lang,
- i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration
- from tmp_bytea t
- inner join content_ctags c using(id)
- inner join indexer_configuration i on i.id = c.indexer_configuration_id
- order by line;
- return;
-end
-$$;
-
-comment on function swh_content_ctags_get() IS 'List content ctags';
-
-- Search within ctags content.
--
create or replace function swh_content_ctags_search(
@@ -440,77 +260,7 @@
comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
-create type content_fossology_license_signature as (
- id sha1,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb,
- licenses text[]
-);
-
--- Retrieve list of content license from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_fossology_license_get()
- returns setof content_fossology_license_signature
- language plpgsql
-as $$
-begin
- return query
- select cl.id,
- ic.id as tool_id,
- ic.tool_name,
- ic.tool_version,
- ic.tool_configuration,
- array(select name
- from fossology_license
- where id = ANY(array_agg(cl.license_id))) as licenses
- from tmp_bytea tcl
- inner join content_fossology_license cl using(id)
- inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id
- group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration;
- return;
-end
-$$;
-
-comment on function swh_content_fossology_license_get() IS 'List content licenses';
-
-- content_metadata functions
---
--- create a temporary table for content_metadata tmp_content_metadata,
-create or replace function swh_mktemp_content_metadata_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_metadata_missing (
- id sha1,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata';
-
--- check which entries of tmp_bytea are missing from content_metadata
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_metadata_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- select id::sha1 from tmp_content_metadata_missing as tmp
- where not exists
- (select 1 from content_metadata as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
- return;
-end
-$$;
-
-comment on function swh_content_metadata_missing() IS 'Filter missing content metadata';
-- add tmp_content_metadata entries to content_metadata, overwriting
-- duplicates if conflict_update is true, skipping duplicates otherwise.
@@ -558,71 +308,8 @@
comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata';
---
-create type content_metadata_signature as (
- id sha1,
- translated_metadata jsonb,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of content metadata from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_content_metadata_get()
- returns setof content_metadata_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join content_metadata c on c.id = t.id
- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
- return;
-end
-$$;
-
-comment on function swh_content_metadata_get() is 'List content''s metadata';
-- end content_metadata functions
--- revision_metadata functions
---
--- create a temporary table for revision_metadata tmp_revision_metadata,
-create or replace function swh_mktemp_revision_metadata_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_revision_metadata_missing (
- id sha1_git,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata';
-
--- check which entries of tmp_bytea are missing from revision_metadata
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_revision_metadata_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- select id::sha1 from tmp_revision_metadata_missing as tmp
- where not exists
- (select 1 from revision_metadata as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
- return;
-end
-$$;
-
-comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata';
-
-- add tmp_revision_metadata entries to revision_metadata, overwriting
-- duplicates if conflict_update is true, skipping duplicates otherwise.
--
@@ -669,33 +356,6 @@
comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata';
---
-create type revision_metadata_signature as (
- id sha1_git,
- translated_metadata jsonb,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of revision metadata from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_revision_metadata_get()
- returns setof revision_metadata_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join revision_metadata c on c.id = t.id
- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
- return;
-end
-$$;
-
create or replace function swh_mktemp_indexer_configuration()
returns void
language sql
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(114, now(), 'Work In Progress');
+ values(115, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/115.sql
@@ -0,0 +1,40 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 114
+-- to_version: 115
+-- description: Remove temporary table use in reading api
+
+insert into dbversion(version, release, description)
+values(115, now(), 'Work In Progress');
+
+drop function swh_mktemp_content_mimetype_missing();
+drop function swh_content_mimetype_missing();
+
+drop function swh_content_mimetype_get();
+drop type content_mimetype_signature;
+
+drop function swh_mktemp_content_language_missing();
+drop function swh_content_language_missing();
+
+drop function swh_content_language_get();
+drop type content_language_signature;
+
+drop function swh_mktemp_content_ctags_missing();
+drop function swh_content_ctags_missing();
+
+drop function swh_content_ctags_get();
+--drop type content_ctags_signature; -- still used in swh_content_ctags_search
+
+drop function swh_content_fossology_license_get();
+drop type content_fossology_license_signature;
+
+drop function swh_mktemp_content_metadata_missing();
+drop function swh_content_metadata_missing();
+
+drop function swh_content_metadata_get();
+drop type content_metadata_signature;
+
+drop function swh_mktemp_revision_metadata_missing();
+drop function swh_revision_metadata_missing();
+
+drop function swh_revision_metadata_get();
+drop type revision_metadata_signature;
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -95,20 +95,16 @@
Args:
mimetypes (iterable): iterable of dict with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- iterable: an iterable of missing id for the triplets id, tool_name,
- tool_version
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_mimetype_missing(cur)
- db.copy_to(mimetypes, 'tmp_content_mimetype_missing',
- ['id', 'indexer_configuration_id'],
- cur)
- for obj in db.content_mimetype_missing_from_temp(cur):
+ for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
yield obj[0]
@db_transaction()
@@ -119,13 +115,14 @@
Args:
mimetypes (iterable): dictionaries with keys:
- - id (bytes): sha1 identifier
- - mimetype (bytes): raw content's mimetype
- - encoding (bytes): raw content's encoding
- - indexer_configuration_id (int): tool's id used to
- compute the results
- - conflict_update: Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the default)
+ id (bytes): sha1 identifier
+ mimetype (bytes): raw content's mimetype
+ encoding (bytes): raw content's encoding
+ indexer_configuration_id (int): tool's id used to
+ compute the results
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates
+ (false, the default)
"""
db.mktemp_content_mimetype(cur)
@@ -136,8 +133,21 @@
@db_transaction_generator()
def content_mimetype_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.content_mimetype_get_from_temp():
+ """Retrieve full content mimetype per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ mimetypes (iterable): dictionaries with keys:
+
+ id (bytes): sha1 identifier
+ mimetype (bytes): raw content's mimetype
+ encoding (bytes): raw content's encoding
+ tool (dict): Tool used to compute the language
+
+ """
+ for c in db.content_mimetype_get_from_list(ids, cur):
yield converters.db_to_mimetype(
dict(zip(db.content_mimetype_cols, c)))
@@ -148,24 +158,34 @@
Args:
languages (iterable): dictionaries with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- iterable: identifiers of missing languages
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_language_missing(cur)
- db.copy_to(languages, 'tmp_content_language_missing',
- ['id', 'indexer_configuration_id'], cur)
- for obj in db.content_language_missing_from_temp(cur):
+ for obj in db.content_language_missing_from_list(languages, cur):
yield obj[0]
@db_transaction_generator()
def content_language_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.content_language_get_from_temp():
+ """Retrieve full content language per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ languages (iterable): dictionaries with keys:
+
+ id (bytes): sha1 identifier
+ lang (bytes): raw content's language
+ tool (dict): Tool used to compute the language
+
+ """
+ for c in db.content_language_get_from_list(ids, cur):
yield converters.db_to_language(
dict(zip(db.content_language_cols, c)))
@@ -177,11 +197,12 @@
Args:
languages (iterable): dictionaries with keys:
- - id: sha1
- - lang: bytes
+ id (bytes): sha1
+ lang (bytes): language detected
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates (false, the
+ default)
"""
db.mktemp_content_language(cur)
@@ -204,20 +225,16 @@
Args:
ctags (iterable): dicts with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool name used
- - tool_version (str): associated version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- an iterable of missing id
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_ctags_missing(cur)
- db.copy_to(ctags,
- tblname='tmp_content_ctags_missing',
- columns=['id', 'indexer_configuration_id'],
- cur=cur)
- for obj in db.content_ctags_missing_from_temp(cur):
+ for obj in db.content_ctags_missing_from_list(ctags, cur):
yield obj[0]
@db_transaction_generator()
@@ -227,9 +244,18 @@
Args:
ids (iterable): sha1 checksums
+ Yields:
+ Dictionaries with keys:
+
+ id (bytes): content's identifier
+ name (str): symbol's name
+ kind (str): symbol's kind
+ language (str): language for that content
+ tool (dict): tool used to compute the ctags' info
+
+
"""
- db.store_tmp_bytea(ids, cur)
- for c in db.content_ctags_get_from_temp():
+ for c in db.content_ctags_get_from_list(ids, cur):
yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
@db_transaction()
@@ -240,9 +266,9 @@
Args:
ctags (iterable): dictionaries with keys:
- - id (bytes): sha1
- - ctags ([list): List of dictionary with keys: name, kind,
- line, language
+ id (bytes): sha1
+ ctags ([list): List of dictionary with keys: name, kind,
+ line, language
"""
def _convert_ctags(__ctags):
@@ -289,13 +315,12 @@
Yields:
list: dictionaries with the following keys:
- - id (bytes)
- - licenses ([str]): associated licenses for that content
+ id (bytes)
+ licenses ([str]): associated licenses for that content
+ tool (dict): Tool used to compute the license
"""
- db.store_tmp_bytea(ids, cur)
-
- for c in db.content_fossology_license_get_from_temp():
+ for c in db.content_fossology_license_get_from_list(ids, cur):
license = dict(zip(db.content_fossology_license_cols, c))
yield converters.db_to_fossology_license(license)
@@ -339,24 +364,34 @@
Args:
metadatas (iterable): dictionaries with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- iterable: missing ids
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_metadata_missing(cur)
- db.copy_to(metadatas, 'tmp_content_metadata_missing',
- ['id', 'indexer_configuration_id'], cur)
- for obj in db.content_metadata_missing_from_temp(cur):
+ for obj in db.content_metadata_missing_from_list(metadatas, cur):
yield obj[0]
@db_transaction_generator()
def content_metadata_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.content_metadata_get_from_temp():
+ """Retrieve metadata per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ id (bytes)
+ translated_metadata (str): associated metadata
+ tool (dict): tool used to compute metadata
+
+ """
+ for c in db.content_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.content_metadata_cols, c)))
@@ -368,8 +403,8 @@
Args:
metadatas (iterable): dictionaries with keys:
- - id: sha1
- - translated_metadata: bytes / jsonb ?
+ id: sha1
+ translated_metadata: bytes / jsonb ?
conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)
@@ -390,24 +425,33 @@
Args:
metadatas (iterable): dictionaries with keys:
- - id (bytes): sha1_git revision identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1_git revision identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
Returns:
iterable: missing ids
"""
- db.mktemp_revision_metadata_missing(cur)
- db.copy_to(metadatas, 'tmp_revision_metadata_missing',
- ['id', 'indexer_configuration_id'], cur)
- for obj in db.revision_metadata_missing_from_temp(cur):
+ for obj in db.revision_metadata_missing_from_list(metadatas, cur):
yield obj[0]
@db_transaction_generator()
def revision_metadata_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.revision_metadata_get_from_temp():
+ """Retrieve revision metadata per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ id (bytes)
+ translated_metadata (str): associated metadata
+ tool (dict): tool used to compute metadata
+
+ """
+ for c in db.revision_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.revision_metadata_cols, c)))
@@ -434,54 +478,6 @@
cur)
db.revision_metadata_add_from_temp(conflict_update, cur)
- @db_transaction()
- def origin_metadata_add(self, origin_id, ts, provider, tool, metadata,
- db=None, cur=None):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
-
- Args:
- origin_id (int): the origin's id for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider (int): the provider of metadata (ex:'hal')
- tool (int): tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
-
- Returns:
- id (int): the origin_metadata unique id
- """
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
-
- return db.origin_metadata_add(origin_id, ts, provider, tool,
- metadata, cur)
-
- @db_transaction_generator()
- def origin_metadata_get_by(self, origin_id, provider_type=None, db=None,
- cur=None):
- """Retrieve list of all origin_metadata entries for the origin_id
-
- Args:
- origin_id (int): the unique origin identifier
- provider_type (str): (optional) type of provider
-
- Returns:
- list of dicts: the origin_metadata dictionary with the keys:
-
- - id (int): origin_metadata's id
- - origin_id (int): origin's id
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
-
- """
- for line in db.origin_metadata_get_by(origin_id, provider_type, cur):
- yield dict(zip(db.origin_metadata_get_cols, line))
-
@db_transaction_generator()
def indexer_configuration_add(self, tools, db=None, cur=None):
"""Add new tools to the storage.
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,7 +6,7 @@
from swh.model import hashutil
from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes
-from swh.storage.db import line_to_bytes
+from swh.storage.db import line_to_bytes, execute_values_to_bytes
class Db(BaseDb):
@@ -24,20 +24,33 @@
self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea',
['id'], cur)
- content_mimetype_cols = [
- 'id', 'mimetype', 'encoding',
- 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
-
- @stored_procedure('swh_mktemp_content_mimetype_missing')
- def mktemp_content_mimetype_missing(self, cur=None): pass
+ content_mimetype_hash_keys = ['id', 'indexer_configuration_id']
- def content_mimetype_missing_from_temp(self, cur=None):
+ def content_mimetype_missing_from_list(self, mimetypes, cur=None):
"""List missing mimetypes.
"""
cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_mimetype_missing()")
- yield from cursor_to_bytes(cur)
+ keys = ', '.join(self.content_mimetype_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_mimetype_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_mimetype c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(m[k] for k in self.content_mimetype_hash_keys)
+ for m in mimetypes)
+ )
+
+ content_mimetype_cols = [
+ 'id', 'mimetype', 'encoding',
+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_mimetype')
def mktemp_content_mimetype(self, cur=None): pass
@@ -46,12 +59,70 @@
self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)",
(conflict_update, ))
- def content_mimetype_get_from_temp(self, cur=None):
+ def _convert_key(self, key, main_table='c'):
+ """Convert keys according to specific use in the module.
+ Args:
+ key (str): Key expression to change according to the alias
+ used in the query
+ main_table (str): Alias to use for the main table. Default
+ to c for content_{something}.
+
+ Expected:
+ Tables content_{something} being aliased as 'c' (something
+ in {language, mimetype, ...}), table indexer_configuration
+ being aliased as 'i'.
+
+ """
+ if key == 'id':
+ return '%s.id' % main_table
+ elif key == 'tool_id':
+ return 'i.id as tool_id'
+ elif key == 'licenses':
+ return '''
+ array(select name
+ from fossology_license
+ where id = ANY(
+ array_agg(%s.license_id))) as licenses''' % main_table
+ return key
+
+ def content_mimetype_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_mimetype_get()" % (
- ','.join(self.content_mimetype_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_mimetype_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_mimetype c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_language_hash_keys = ['id', 'indexer_configuration_id']
+
+ def content_language_missing_from_list(self, languages, cur=None):
+ """List missing languages.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_language_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_language_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_language c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(l[k] for k in self.content_language_hash_keys)
+ for l in languages)
+ )
content_language_cols = [
'id', 'lang',
@@ -60,27 +131,48 @@
@stored_procedure('swh_mktemp_content_language')
def mktemp_content_language(self, cur=None): pass
- @stored_procedure('swh_mktemp_content_language_missing')
- def mktemp_content_language_missing(self, cur=None): pass
-
- def content_language_missing_from_temp(self, cur=None):
- """List missing languages.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_language_missing()")
- yield from cursor_to_bytes(cur)
-
def content_language_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_language_add(%s)",
(conflict_update, ))
- def content_language_get_from_temp(self, cur=None):
+ def content_language_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_language_get()" % (
- ','.join(self.content_language_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_language_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_language c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_ctags_hash_keys = ['id', 'indexer_configuration_id']
+
+ def content_ctags_missing_from_list(self, ctags, cur=None):
+ """List missing ctags.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_ctags_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_ctags_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_ctags c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(c[k] for k in self.content_ctags_hash_keys)
+ for c in ctags)
+ )
content_ctags_cols = [
'id', 'name', 'kind', 'line', 'lang',
@@ -89,27 +181,24 @@
@stored_procedure('swh_mktemp_content_ctags')
def mktemp_content_ctags(self, cur=None): pass
- @stored_procedure('swh_mktemp_content_ctags_missing')
- def mktemp_content_ctags_missing(self, cur=None): pass
-
- def content_ctags_missing_from_temp(self, cur=None):
- """List missing ctags.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_ctags_missing()")
- yield from cursor_to_bytes(cur)
-
def content_ctags_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)",
(conflict_update, ))
- def content_ctags_get_from_temp(self, cur=None):
+ def content_ctags_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_ctags_get()" % (
- ','.join(self.content_ctags_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_ctags_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_ctags c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
def content_ctags_search(self, expression, last_sha1, limit, cur=None):
cur = self._cursor(cur)
@@ -147,15 +236,48 @@
"SELECT swh_content_fossology_license_add(%s)",
(conflict_update, ))
- def content_fossology_license_get_from_temp(self, cur=None):
- """Retrieve licenses per content.
+ def content_fossology_license_get_from_list(self, ids, cur=None):
+ """Retrieve licenses per id.
"""
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_fossology_license_get()" % (
- ','.join(self.content_fossology_license_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_fossology_license_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_fossology_license c on t.id=c.id
+ inner join indexer_configuration i
+ on i.id=c.indexer_configuration_id
+ group by c.id, i.id, i.tool_name, i.tool_version,
+ i.tool_configuration;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_metadata_hash_keys = ['id', 'indexer_configuration_id']
+
+ def content_metadata_missing_from_list(self, metadata, cur=None):
+ """List missing metadata.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_metadata_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_metadata_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_metadata c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(m[k] for k in self.content_metadata_hash_keys)
+ for m in metadata)
+ )
content_metadata_cols = [
'id', 'translated_metadata',
@@ -164,27 +286,48 @@
@stored_procedure('swh_mktemp_content_metadata')
def mktemp_content_metadata(self, cur=None): pass
- @stored_procedure('swh_mktemp_content_metadata_missing')
- def mktemp_content_metadata_missing(self, cur=None): pass
-
- def content_metadata_missing_from_temp(self, cur=None):
- """List missing metadatas.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_metadata_missing()")
- yield from cursor_to_bytes(cur)
-
def content_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
(conflict_update, ))
- def content_metadata_get_from_temp(self, cur=None):
+ def content_metadata_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_metadata_get()" % (
- ','.join(self.content_metadata_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_metadata_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_metadata c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_revision_metadata_hash_keys = ['id', 'indexer_configuration_id']
+
+ def revision_metadata_missing_from_list(self, metadata, cur=None):
+ """List missing metadata.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_revision_metadata_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = r.%s' % (key, key))
+ for key in self.content_revision_metadata_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from revision_metadata r
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(m[k] for k in self.content_revision_metadata_hash_keys)
+ for m in metadata)
+ )
revision_metadata_cols = [
'id', 'translated_metadata',
@@ -193,27 +336,25 @@
@stored_procedure('swh_mktemp_revision_metadata')
def mktemp_revision_metadata(self, cur=None): pass
- @stored_procedure('swh_mktemp_revision_metadata_missing')
- def mktemp_revision_metadata_missing(self, cur=None): pass
-
- def revision_metadata_missing_from_temp(self, cur=None):
- """List missing metadatas.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_revision_metadata_missing()")
- yield from cursor_to_bytes(cur)
-
def revision_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
(conflict_update, ))
- def revision_metadata_get_from_temp(self, cur=None):
+ def revision_metadata_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_revision_metadata_get()" % (
- ','.join(self.revision_metadata_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(lambda k: self._convert_key(k, main_table='r'),
+ self.revision_metadata_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join revision_metadata r
+ on r.id=t.id
+ inner join indexer_configuration i
+ on r.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mar 17 2025, 6:30 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3231595
Attached To
D343: indexer: Migrate away from temporary table
Event Timeline
Log In to Comment