Page MenuHomeSoftware Heritage

D343.id1095.diff
No OneTemporary

D343.id1095.diff

diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -38,39 +38,6 @@
end
$$;
--- create a temporary table for content_ctags tmp_content_mimetype_missing,
-create or replace function swh_mktemp_content_mimetype_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_mimetype_missing (
- id sha1,
- indexer_configuration_id bigint
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information';
-
--- check which entries of tmp_bytea are missing from content_mimetype
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_mimetype_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- (select id::sha1 from tmp_content_mimetype_missing as tmp
- where not exists
- (select 1 from content_mimetype as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id));
- return;
-end
-$$;
-
-comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information';
-
-- create a temporary table for content_mimetype tmp_content_mimetype,
create or replace function swh_mktemp_content_mimetype()
returns void
@@ -118,70 +85,6 @@
comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes';
-create type content_mimetype_signature as(
- id sha1,
- mimetype bytea,
- encoding bytea,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of content mimetype from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_mimetype_get()
- returns setof content_mimetype_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, mimetype, encoding,
- i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join content_mimetype c on c.id=t.id
- inner join indexer_configuration i on c.indexer_configuration_id=i.id;
- return;
-end
-$$;
-
-comment on function swh_content_mimetype_get() IS 'List content''s mimetypes';
-
--- create a temporary table for content_language tmp_content_language,
-create or replace function swh_mktemp_content_language_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_language_missing (
- id sha1,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language';
-
--- check which entries of tmp_bytea are missing from content_language
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_language_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- select id::sha1 from tmp_content_language_missing as tmp
- where not exists
- (select 1 from content_language as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
- return;
-end
-$$;
-
-comment on function swh_content_language_missing() IS 'Filter missing content languages';
-
-- add tmp_content_language entries to content_language, overwriting
-- duplicates if conflict_update is true, skipping duplicates otherwise.
--
@@ -228,34 +131,6 @@
comment on function swh_mktemp_content_language() is 'Helper table to add content language';
-create type content_language_signature as (
- id sha1,
- lang languages,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of content language from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_content_language_get()
- returns setof content_language_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join content_language c on c.id = t.id
- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
- return;
-end
-$$;
-
-comment on function swh_content_language_get() is 'List content''s language';
-
-- create a temporary table for content_ctags tmp_content_ctags,
create or replace function swh_mktemp_content_ctags()
@@ -298,40 +173,6 @@
comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content';
--- create a temporary table for content_ctags missing routine
-create or replace function swh_mktemp_content_ctags_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_ctags_missing (
- id sha1,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags';
-
--- check which entries of tmp_bytea are missing from content_ctags
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_ctags_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- (select id::sha1 from tmp_content_ctags_missing as tmp
- where not exists
- (select 1 from content_ctags as c
- where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id
- limit 1));
- return;
-end
-$$;
-
-comment on function swh_content_ctags_missing() IS 'Filter missing content ctags';
-
create type content_ctags_signature as (
id sha1,
name text,
@@ -344,27 +185,6 @@
tool_configuration jsonb
);
--- Retrieve list of content ctags from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_content_ctags_get()
- returns setof content_ctags_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, c.name, c.kind, c.line, c.lang,
- i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration
- from tmp_bytea t
- inner join content_ctags c using(id)
- inner join indexer_configuration i on i.id = c.indexer_configuration_id
- order by line;
- return;
-end
-$$;
-
-comment on function swh_content_ctags_get() IS 'List content ctags';
-
-- Search within ctags content.
--
create or replace function swh_content_ctags_search(
@@ -440,77 +260,7 @@
comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses';
-create type content_fossology_license_signature as (
- id sha1,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb,
- licenses text[]
-);
-
--- Retrieve list of content license from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_fossology_license_get()
- returns setof content_fossology_license_signature
- language plpgsql
-as $$
-begin
- return query
- select cl.id,
- ic.id as tool_id,
- ic.tool_name,
- ic.tool_version,
- ic.tool_configuration,
- array(select name
- from fossology_license
- where id = ANY(array_agg(cl.license_id))) as licenses
- from tmp_bytea tcl
- inner join content_fossology_license cl using(id)
- inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id
- group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration;
- return;
-end
-$$;
-
-comment on function swh_content_fossology_license_get() IS 'List content licenses';
-
-- content_metadata functions
---
--- create a temporary table for content_metadata tmp_content_metadata,
-create or replace function swh_mktemp_content_metadata_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_content_metadata_missing (
- id sha1,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata';
-
--- check which entries of tmp_bytea are missing from content_metadata
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_content_metadata_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- select id::sha1 from tmp_content_metadata_missing as tmp
- where not exists
- (select 1 from content_metadata as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
- return;
-end
-$$;
-
-comment on function swh_content_metadata_missing() IS 'Filter missing content metadata';
-- add tmp_content_metadata entries to content_metadata, overwriting
-- duplicates if conflict_update is true, skipping duplicates otherwise.
@@ -558,71 +308,8 @@
comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata';
---
-create type content_metadata_signature as (
- id sha1,
- translated_metadata jsonb,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of content metadata from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_content_metadata_get()
- returns setof content_metadata_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join content_metadata c on c.id = t.id
- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
- return;
-end
-$$;
-
-comment on function swh_content_metadata_get() is 'List content''s metadata';
-- end content_metadata functions
--- revision_metadata functions
---
--- create a temporary table for revision_metadata tmp_revision_metadata,
-create or replace function swh_mktemp_revision_metadata_missing()
- returns void
- language sql
-as $$
- create temporary table tmp_revision_metadata_missing (
- id sha1_git,
- indexer_configuration_id integer
- ) on commit drop;
-$$;
-
-comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata';
-
--- check which entries of tmp_bytea are missing from revision_metadata
---
--- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea,
--- 2. call this function
-create or replace function swh_revision_metadata_missing()
- returns setof sha1
- language plpgsql
-as $$
-begin
- return query
- select id::sha1 from tmp_revision_metadata_missing as tmp
- where not exists
- (select 1 from revision_metadata as c
- where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
- return;
-end
-$$;
-
-comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata';
-
-- add tmp_revision_metadata entries to revision_metadata, overwriting
-- duplicates if conflict_update is true, skipping duplicates otherwise.
--
@@ -669,33 +356,6 @@
comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata';
---
-create type revision_metadata_signature as (
- id sha1_git,
- translated_metadata jsonb,
- tool_id integer,
- tool_name text,
- tool_version text,
- tool_configuration jsonb
-);
-
--- Retrieve list of revision metadata from the temporary table.
---
--- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
-create or replace function swh_revision_metadata_get()
- returns setof revision_metadata_signature
- language plpgsql
-as $$
-begin
- return query
- select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
- from tmp_bytea t
- inner join revision_metadata c on c.id = t.id
- inner join indexer_configuration i on i.id=c.indexer_configuration_id;
- return;
-end
-$$;
-
create or replace function swh_mktemp_indexer_configuration()
returns void
language sql
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(114, now(), 'Work In Progress');
+ values(115, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/sql/upgrades/115.sql b/sql/upgrades/115.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/115.sql
@@ -0,0 +1,40 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 114
+-- to_version: 115
+-- description: Remove temporary table use in reading api
+
+insert into dbversion(version, release, description)
+values(115, now(), 'Work In Progress');
+
+drop function swh_mktemp_content_mimetype_missing();
+drop function swh_content_mimetype_missing();
+
+drop function swh_content_mimetype_get();
+drop type content_mimetype_signature;
+
+drop function swh_mktemp_content_language_missing();
+drop function swh_content_language_missing();
+
+drop function swh_content_language_get();
+drop type content_language_signature;
+
+drop function swh_mktemp_content_ctags_missing();
+drop function swh_content_ctags_missing();
+
+drop function swh_content_ctags_get();
+--drop type content_ctags_signature; -- still used in swh_content_ctags_search
+
+drop function swh_content_fossology_license_get();
+drop type content_fossology_license_signature;
+
+drop function swh_mktemp_content_metadata_missing();
+drop function swh_content_metadata_missing();
+
+drop function swh_content_metadata_get();
+drop type content_metadata_signature;
+
+drop function swh_mktemp_revision_metadata_missing();
+drop function swh_revision_metadata_missing();
+
+drop function swh_revision_metadata_get();
+drop type revision_metadata_signature;
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -95,20 +95,16 @@
Args:
mimetypes (iterable): iterable of dict with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- iterable: an iterable of missing id for the triplets id, tool_name,
- tool_version
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_mimetype_missing(cur)
- db.copy_to(mimetypes, 'tmp_content_mimetype_missing',
- ['id', 'indexer_configuration_id'],
- cur)
- for obj in db.content_mimetype_missing_from_temp(cur):
+ for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
yield obj[0]
@db_transaction()
@@ -119,13 +115,14 @@
Args:
mimetypes (iterable): dictionaries with keys:
- - id (bytes): sha1 identifier
- - mimetype (bytes): raw content's mimetype
- - encoding (bytes): raw content's encoding
- - indexer_configuration_id (int): tool's id used to
- compute the results
- - conflict_update: Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the default)
+ id (bytes): sha1 identifier
+ mimetype (bytes): raw content's mimetype
+ encoding (bytes): raw content's encoding
+ indexer_configuration_id (int): tool's id used to
+ compute the results
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates
+ (false, the default)
"""
db.mktemp_content_mimetype(cur)
@@ -136,8 +133,21 @@
@db_transaction_generator()
def content_mimetype_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.content_mimetype_get_from_temp():
+ """Retrieve full content mimetype per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ mimetypes (iterable): dictionaries with keys:
+
+ id (bytes): sha1 identifier
+ mimetype (bytes): raw content's mimetype
+ encoding (bytes): raw content's encoding
+ tool (dict): Tool used to compute the language
+
+ """
+ for c in db.content_mimetype_get_from_list(ids, cur):
yield converters.db_to_mimetype(
dict(zip(db.content_mimetype_cols, c)))
@@ -148,24 +158,34 @@
Args:
languages (iterable): dictionaries with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- iterable: identifiers of missing languages
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_language_missing(cur)
- db.copy_to(languages, 'tmp_content_language_missing',
- ['id', 'indexer_configuration_id'], cur)
- for obj in db.content_language_missing_from_temp(cur):
+ for obj in db.content_language_missing_from_list(languages, cur):
yield obj[0]
@db_transaction_generator()
def content_language_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.content_language_get_from_temp():
+ """Retrieve full content language per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ languages (iterable): dictionaries with keys:
+
+ id (bytes): sha1 identifier
+ lang (bytes): raw content's language
+ tool (dict): Tool used to compute the language
+
+ """
+ for c in db.content_language_get_from_list(ids, cur):
yield converters.db_to_language(
dict(zip(db.content_language_cols, c)))
@@ -177,11 +197,12 @@
Args:
languages (iterable): dictionaries with keys:
- - id: sha1
- - lang: bytes
+ id (bytes): sha1
+ lang (bytes): language detected
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates (false, the
+ default)
"""
db.mktemp_content_language(cur)
@@ -204,20 +225,16 @@
Args:
ctags (iterable): dicts with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool name used
- - tool_version (str): associated version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- an iterable of missing id
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_ctags_missing(cur)
- db.copy_to(ctags,
- tblname='tmp_content_ctags_missing',
- columns=['id', 'indexer_configuration_id'],
- cur=cur)
- for obj in db.content_ctags_missing_from_temp(cur):
+ for obj in db.content_ctags_missing_from_list(ctags, cur):
yield obj[0]
@db_transaction_generator()
@@ -227,9 +244,18 @@
Args:
ids (iterable): sha1 checksums
+ Yields:
+ Dictionaries with keys:
+
+ id (bytes): content's identifier
+ name (str): symbol's name
+ kind (str): symbol's kind
+ language (str): language for that content
+ tool (dict): tool used to compute the ctags' info
+
+
"""
- db.store_tmp_bytea(ids, cur)
- for c in db.content_ctags_get_from_temp():
+ for c in db.content_ctags_get_from_list(ids, cur):
yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
@db_transaction()
@@ -240,9 +266,9 @@
Args:
ctags (iterable): dictionaries with keys:
- - id (bytes): sha1
- - ctags ([list): List of dictionary with keys: name, kind,
- line, language
+ id (bytes): sha1
+ ctags ([list): List of dictionary with keys: name, kind,
+ line, language
"""
def _convert_ctags(__ctags):
@@ -289,13 +315,12 @@
Yields:
list: dictionaries with the following keys:
- - id (bytes)
- - licenses ([str]): associated licenses for that content
+ id (bytes)
+ licenses ([str]): associated licenses for that content
+ tool (dict): Tool used to compute the license
"""
- db.store_tmp_bytea(ids, cur)
-
- for c in db.content_fossology_license_get_from_temp():
+ for c in db.content_fossology_license_get_from_list(ids, cur):
license = dict(zip(db.content_fossology_license_cols, c))
yield converters.db_to_fossology_license(license)
@@ -339,24 +364,34 @@
Args:
metadatas (iterable): dictionaries with keys:
- - id (bytes): sha1 identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1 identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
- Returns:
- iterable: missing ids
+ Yields:
+ an iterable of missing id for the tuple (id,
+ indexer_configuration_id)
"""
- db.mktemp_content_metadata_missing(cur)
- db.copy_to(metadatas, 'tmp_content_metadata_missing',
- ['id', 'indexer_configuration_id'], cur)
- for obj in db.content_metadata_missing_from_temp(cur):
+ for obj in db.content_metadata_missing_from_list(metadatas, cur):
yield obj[0]
@db_transaction_generator()
def content_metadata_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.content_metadata_get_from_temp():
+ """Retrieve metadata per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ id (bytes)
+ translated_metadata (str): associated metadata
+ tool (dict): tool used to compute metadata
+
+ """
+ for c in db.content_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.content_metadata_cols, c)))
@@ -368,8 +403,8 @@
Args:
metadatas (iterable): dictionaries with keys:
- - id: sha1
- - translated_metadata: bytes / jsonb ?
+ id: sha1
+ translated_metadata: bytes / jsonb ?
conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)
@@ -390,24 +425,33 @@
Args:
metadatas (iterable): dictionaries with keys:
- - id (bytes): sha1_git revision identifier
- - tool_name (str): tool used to compute the results
- - tool_version (str): associated tool's version
+ id (bytes): sha1_git revision identifier
+ indexer_configuration_id (int): tool used to compute
+ the results
Returns:
iterable: missing ids
"""
- db.mktemp_revision_metadata_missing(cur)
- db.copy_to(metadatas, 'tmp_revision_metadata_missing',
- ['id', 'indexer_configuration_id'], cur)
- for obj in db.revision_metadata_missing_from_temp(cur):
+ for obj in db.revision_metadata_missing_from_list(metadatas, cur):
yield obj[0]
@db_transaction_generator()
def revision_metadata_get(self, ids, db=None, cur=None):
- db.store_tmp_bytea(ids, cur)
- for c in db.revision_metadata_get_from_temp():
+ """Retrieve revision metadata per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ id (bytes)
+ translated_metadata (str): associated metadata
+ tool (dict): tool used to compute metadata
+
+ """
+ for c in db.revision_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.revision_metadata_cols, c)))
@@ -434,54 +478,6 @@
cur)
db.revision_metadata_add_from_temp(conflict_update, cur)
- @db_transaction()
- def origin_metadata_add(self, origin_id, ts, provider, tool, metadata,
- db=None, cur=None):
- """ Add an origin_metadata for the origin at ts with provenance and
- metadata.
-
- Args:
- origin_id (int): the origin's id for which the metadata is added
- ts (datetime): timestamp of the found metadata
- provider (int): the provider of metadata (ex:'hal')
- tool (int): tool used to extract metadata
- metadata (jsonb): the metadata retrieved at the time and location
-
- Returns:
- id (int): the origin_metadata unique id
- """
- if isinstance(ts, str):
- ts = dateutil.parser.parse(ts)
-
- return db.origin_metadata_add(origin_id, ts, provider, tool,
- metadata, cur)
-
- @db_transaction_generator()
- def origin_metadata_get_by(self, origin_id, provider_type=None, db=None,
- cur=None):
- """Retrieve list of all origin_metadata entries for the origin_id
-
- Args:
- origin_id (int): the unique origin identifier
- provider_type (str): (optional) type of provider
-
- Returns:
- list of dicts: the origin_metadata dictionary with the keys:
-
- - id (int): origin_metadata's id
- - origin_id (int): origin's id
- - discovery_date (datetime): timestamp of discovery
- - tool_id (int): metadata's extracting tool
- - metadata (jsonb)
- - provider_id (int): metadata's provider
- - provider_name (str)
- - provider_type (str)
- - provider_url (str)
-
- """
- for line in db.origin_metadata_get_by(origin_id, provider_type, cur):
- yield dict(zip(db.origin_metadata_get_cols, line))
-
@db_transaction_generator()
def indexer_configuration_add(self, tools, db=None, cur=None):
"""Add new tools to the storage.
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2017 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,7 +6,7 @@
from swh.model import hashutil
from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes
-from swh.storage.db import line_to_bytes
+from swh.storage.db import line_to_bytes, execute_values_to_bytes
class Db(BaseDb):
@@ -24,20 +24,33 @@
self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea',
['id'], cur)
- content_mimetype_cols = [
- 'id', 'mimetype', 'encoding',
- 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
-
- @stored_procedure('swh_mktemp_content_mimetype_missing')
- def mktemp_content_mimetype_missing(self, cur=None): pass
+ content_mimetype_hash_keys = ['id', 'indexer_configuration_id']
- def content_mimetype_missing_from_temp(self, cur=None):
+ def content_mimetype_missing_from_list(self, mimetypes, cur=None):
"""List missing mimetypes.
"""
cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_mimetype_missing()")
- yield from cursor_to_bytes(cur)
+ keys = ', '.join(self.content_mimetype_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_mimetype_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_mimetype c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(m[k] for k in self.content_mimetype_hash_keys)
+ for m in mimetypes)
+ )
+
+ content_mimetype_cols = [
+ 'id', 'mimetype', 'encoding',
+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
@stored_procedure('swh_mktemp_content_mimetype')
def mktemp_content_mimetype(self, cur=None): pass
@@ -46,12 +59,70 @@
self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)",
(conflict_update, ))
- def content_mimetype_get_from_temp(self, cur=None):
+ def _convert_key(self, key, main_table='c'):
+ """Convert keys according to specific use in the module.
+ Args:
+ key (str): Key expression to change according to the alias
+ used in the query
+ main_table (str): Alias to use for the main table. Default
+ to c for content_{something}.
+
+ Expected:
+ Tables content_{something} being aliased as 'c' (something
+ in {language, mimetype, ...}), table indexer_configuration
+ being aliased as 'i'.
+
+ """
+ if key == 'id':
+ return '%s.id' % main_table
+ elif key == 'tool_id':
+ return 'i.id as tool_id'
+ elif key == 'licenses':
+ return '''
+ array(select name
+ from fossology_license
+ where id = ANY(
+ array_agg(%s.license_id))) as licenses''' % main_table
+ return key
+
+ def content_mimetype_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_mimetype_get()" % (
- ','.join(self.content_mimetype_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_mimetype_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_mimetype c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_language_hash_keys = ['id', 'indexer_configuration_id']
+
+ def content_language_missing_from_list(self, languages, cur=None):
+ """List missing languages.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_language_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_language_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_language c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(l[k] for k in self.content_language_hash_keys)
+ for l in languages)
+ )
content_language_cols = [
'id', 'lang',
@@ -60,27 +131,48 @@
@stored_procedure('swh_mktemp_content_language')
def mktemp_content_language(self, cur=None): pass
- @stored_procedure('swh_mktemp_content_language_missing')
- def mktemp_content_language_missing(self, cur=None): pass
-
- def content_language_missing_from_temp(self, cur=None):
- """List missing languages.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_language_missing()")
- yield from cursor_to_bytes(cur)
-
def content_language_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_language_add(%s)",
(conflict_update, ))
- def content_language_get_from_temp(self, cur=None):
+ def content_language_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_language_get()" % (
- ','.join(self.content_language_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_language_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_language c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_ctags_hash_keys = ['id', 'indexer_configuration_id']
+
+ def content_ctags_missing_from_list(self, ctags, cur=None):
+ """List missing ctags.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_ctags_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_ctags_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_ctags c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(c[k] for k in self.content_ctags_hash_keys)
+ for c in ctags)
+ )
content_ctags_cols = [
'id', 'name', 'kind', 'line', 'lang',
@@ -89,27 +181,24 @@
@stored_procedure('swh_mktemp_content_ctags')
def mktemp_content_ctags(self, cur=None): pass
- @stored_procedure('swh_mktemp_content_ctags_missing')
- def mktemp_content_ctags_missing(self, cur=None): pass
-
- def content_ctags_missing_from_temp(self, cur=None):
- """List missing ctags.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_ctags_missing()")
- yield from cursor_to_bytes(cur)
-
def content_ctags_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)",
(conflict_update, ))
- def content_ctags_get_from_temp(self, cur=None):
+ def content_ctags_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_ctags_get()" % (
- ','.join(self.content_ctags_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_ctags_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_ctags c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
def content_ctags_search(self, expression, last_sha1, limit, cur=None):
cur = self._cursor(cur)
@@ -147,15 +236,48 @@
"SELECT swh_content_fossology_license_add(%s)",
(conflict_update, ))
- def content_fossology_license_get_from_temp(self, cur=None):
- """Retrieve licenses per content.
+ def content_fossology_license_get_from_list(self, ids, cur=None):
+ """Retrieve licenses per id.
"""
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_fossology_license_get()" % (
- ','.join(self.content_fossology_license_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_fossology_license_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_fossology_license c on t.id=c.id
+ inner join indexer_configuration i
+ on i.id=c.indexer_configuration_id
+ group by c.id, i.id, i.tool_name, i.tool_version,
+ i.tool_configuration;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_metadata_hash_keys = ['id', 'indexer_configuration_id']
+
+ def content_metadata_missing_from_list(self, metadata, cur=None):
+ """List missing metadata.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_metadata_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = c.%s' % (key, key))
+ for key in self.content_metadata_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from content_metadata c
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(m[k] for k in self.content_metadata_hash_keys)
+ for m in metadata)
+ )
content_metadata_cols = [
'id', 'translated_metadata',
@@ -164,27 +286,48 @@
@stored_procedure('swh_mktemp_content_metadata')
def mktemp_content_metadata(self, cur=None): pass
- @stored_procedure('swh_mktemp_content_metadata_missing')
- def mktemp_content_metadata_missing(self, cur=None): pass
-
- def content_metadata_missing_from_temp(self, cur=None):
- """List missing metadatas.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_content_metadata_missing()")
- yield from cursor_to_bytes(cur)
-
def content_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
(conflict_update, ))
- def content_metadata_get_from_temp(self, cur=None):
+ def content_metadata_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_content_metadata_get()" % (
- ','.join(self.content_metadata_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(self._convert_key, self.content_metadata_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join content_metadata c
+ on c.id=t.id
+ inner join indexer_configuration i
+ on c.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
+
+ content_revision_metadata_hash_keys = ['id', 'indexer_configuration_id']
+
+ def revision_metadata_missing_from_list(self, metadata, cur=None):
+ """List missing metadata.
+
+ """
+ cur = self._cursor(cur)
+ keys = ', '.join(self.content_revision_metadata_hash_keys)
+ equality = ' AND '.join(
+ ('t.%s = r.%s' % (key, key))
+ for key in self.content_revision_metadata_hash_keys
+ )
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s from (values %%s) as t(%s)
+ where not exists (
+ select 1 from revision_metadata r
+ where %s
+ )
+ """ % (keys, keys, equality),
+ (tuple(m[k] for k in self.content_revision_metadata_hash_keys)
+ for m in metadata)
+ )
revision_metadata_cols = [
'id', 'translated_metadata',
@@ -193,27 +336,25 @@
@stored_procedure('swh_mktemp_revision_metadata')
def mktemp_revision_metadata(self, cur=None): pass
- @stored_procedure('swh_mktemp_revision_metadata_missing')
- def mktemp_revision_metadata_missing(self, cur=None): pass
-
- def revision_metadata_missing_from_temp(self, cur=None):
- """List missing metadatas.
-
- """
- cur = self._cursor(cur)
- cur.execute("SELECT * FROM swh_revision_metadata_missing()")
- yield from cursor_to_bytes(cur)
-
def revision_metadata_add_from_temp(self, conflict_update, cur=None):
self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
(conflict_update, ))
- def revision_metadata_get_from_temp(self, cur=None):
+ def revision_metadata_get_from_list(self, ids, cur=None):
cur = self._cursor(cur)
- query = "SELECT %s FROM swh_revision_metadata_get()" % (
- ','.join(self.revision_metadata_cols))
- cur.execute(query)
- yield from cursor_to_bytes(cur)
+ keys = map(lambda k: self._convert_key(k, main_table='r'),
+ self.revision_metadata_cols)
+ yield from execute_values_to_bytes(
+ cur, """
+ select %s
+ from (values %%s) as t(id)
+ inner join revision_metadata r
+ on r.id=t.id
+ inner join indexer_configuration i
+ on r.indexer_configuration_id=i.id;
+ """ % ', '.join(keys),
+ ((_id,) for _id in ids)
+ )
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 6:30 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3231595

Event Timeline