Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/storage/db.py
# Copyright (C) 2015-2018 The Software Heritage developers | # Copyright (C) 2015-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes | from swh.core.db import BaseDb | ||||
from swh.storage.db import line_to_bytes, execute_values_to_bytes | from swh.core.db.db_utils import execute_values_generator, stored_procedure | ||||
class Db(BaseDb): | class Db(BaseDb): | ||||
"""Proxy to the SWH Indexer DB, with wrappers around stored procedures | """Proxy to the SWH Indexer DB, with wrappers around stored procedures | ||||
""" | """ | ||||
# Key columns identifying a content_mimetype entry
# (presumably passed as hash_keys to _missing_from_list -- TODO confirm).
content_mimetype_hash_keys = [
    'id',
    'indexer_configuration_id',
]
Show All 10 Lines | def _missing_from_list(self, table, data, hash_keys, cur=None): | ||||
The data which is missing from the db. | The data which is missing from the db. | ||||
""" | """ | ||||
cur = self._cursor(cur) | cur = self._cursor(cur) | ||||
keys = ', '.join(hash_keys) | keys = ', '.join(hash_keys) | ||||
equality = ' AND '.join( | equality = ' AND '.join( | ||||
('t.%s = c.%s' % (key, key)) for key in hash_keys | ('t.%s = c.%s' % (key, key)) for key in hash_keys | ||||
) | ) | ||||
yield from execute_values_to_bytes( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
select %s from (values %%s) as t(%s) | select %s from (values %%s) as t(%s) | ||||
where not exists ( | where not exists ( | ||||
select 1 from %s c | select 1 from %s c | ||||
where %s | where %s | ||||
) | ) | ||||
""" % (keys, keys, table, equality), | """ % (keys, keys, table, equality), | ||||
(tuple(m[k] for k in hash_keys) for m in data) | (tuple(m[k] for k in hash_keys) for m in data) | ||||
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines | def _get_from_list(self, table, ids, cols, cur=None, id_col='id'): | ||||
inner join {table} c | inner join {table} c | ||||
on c.{id_col}=t.id | on c.{id_col}=t.id | ||||
inner join indexer_configuration i | inner join indexer_configuration i | ||||
on c.indexer_configuration_id=i.id; | on c.indexer_configuration_id=i.id; | ||||
""".format( | """.format( | ||||
keys=', '.join(keys), | keys=', '.join(keys), | ||||
id_col=id_col, | id_col=id_col, | ||||
table=table) | table=table) | ||||
yield from execute_values_to_bytes( | yield from execute_values_generator( | ||||
cur, query, | cur, query, | ||||
((_id,) for _id in ids) | ((_id,) for _id in ids) | ||||
) | ) | ||||
# Maps a short content indexer name to the name of its table
# (presumably looked up by content_get_range -- TODO confirm).
content_indexer_names = dict(
    mimetype='content_mimetype',
    fossology_license='content_fossology_license',
)
Show All 21 Lines | def content_get_range(self, content_type, start, end, | ||||
inner join indexer_configuration ic | inner join indexer_configuration ic | ||||
on t.indexer_configuration_id=ic.id | on t.indexer_configuration_id=ic.id | ||||
%s | %s | ||||
where ic.id=%%s and | where ic.id=%%s and | ||||
%%s <= t.id and t.id <= %%s | %%s <= t.id and t.id <= %%s | ||||
order by t.indexer_configuration_id, t.id | order by t.indexer_configuration_id, t.id | ||||
limit %%s""" % (table, extra) | limit %%s""" % (table, extra) | ||||
cur.execute(query, (indexer_configuration_id, start, end, limit)) | cur.execute(query, (indexer_configuration_id, start, end, limit)) | ||||
yield from cursor_to_bytes(cur) | yield from cur | ||||
def content_mimetype_get_from_list(self, ids, cur=None):
    """Yield the content_mimetype rows found for the given ids.

    Thin wrapper delegating to ``_get_from_list`` on the
    ``content_mimetype`` table, selecting the columns listed in
    ``self.content_mimetype_cols``.

    """
    table, columns = 'content_mimetype', self.content_mimetype_cols
    rows = self._get_from_list(table, ids, columns, cur=cur)
    yield from rows
# Key columns identifying a content_language entry
# (presumably passed as hash_keys to _missing_from_list -- TODO confirm).
content_language_hash_keys = [
    'id',
    'indexer_configuration_id',
]
def content_language_missing_from_list(self, languages, cur=None): | def content_language_missing_from_list(self, languages, cur=None): | ||||
Show All 38 Lines | class Db(BaseDb): | ||||
def content_ctags_add_from_temp(self, conflict_update, cur=None):
    """Call the ``swh_content_ctags_add`` SQL function.

    Args:
        conflict_update: value bound to the function's single parameter
        cur: optional cursor, resolved through ``self._cursor``

    """
    cursor = self._cursor(cur)
    params = (conflict_update, )
    cursor.execute("SELECT swh_content_ctags_add(%s)", params)
def content_ctags_get_from_list(self, ids, cur=None): | def content_ctags_get_from_list(self, ids, cur=None): | ||||
cur = self._cursor(cur) | cur = self._cursor(cur) | ||||
keys = map(self._convert_key, self.content_ctags_cols) | keys = map(self._convert_key, self.content_ctags_cols) | ||||
yield from execute_values_to_bytes( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
select %s | select %s | ||||
from (values %%s) as t(id) | from (values %%s) as t(id) | ||||
inner join content_ctags c | inner join content_ctags c | ||||
on c.id=t.id | on c.id=t.id | ||||
inner join indexer_configuration i | inner join indexer_configuration i | ||||
on c.indexer_configuration_id=i.id | on c.indexer_configuration_id=i.id | ||||
order by line | order by line | ||||
Show All 14 Lines | def content_ctags_search(self, expression, last_sha1, limit, cur=None): | ||||
elif last_sha1: | elif last_sha1: | ||||
last_sha1 = '\\x%s' % last_sha1 | last_sha1 = '\\x%s' % last_sha1 | ||||
query = """SELECT %s | query = """SELECT %s | ||||
FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( | FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( | ||||
','.join(self.content_ctags_cols)) | ','.join(self.content_ctags_cols)) | ||||
cur.execute(query, (expression, limit, last_sha1)) | cur.execute(query, (expression, limit, last_sha1)) | ||||
yield from cursor_to_bytes(cur) | yield from cur | ||||
# Column names for fossology license rows; each name is mapped through
# _convert_key by content_fossology_license_get_from_list.
content_fossology_license_cols = [
    'id',
    'tool_id',
    'tool_name',
    'tool_version',
    'tool_configuration',
    'licenses',
]
@stored_procedure('swh_mktemp_content_fossology_license')
def mktemp_content_fossology_license(self, cur=None):
    # Deliberately empty body: nothing is done here beyond what the
    # stored_procedure decorator provides (presumably it invokes the
    # named SQL procedure -- TODO confirm in swh.core.db.db_utils).
    pass
def content_fossology_license_add_from_temp(self, conflict_update,
                                            cur=None):
    """Add new licenses per content.

    Calls the ``swh_content_fossology_license_add`` SQL function with
    ``conflict_update`` as its single argument.

    Args:
        conflict_update: value bound to the function's parameter
        cur: optional cursor, resolved through ``self._cursor``

    """
    query = "SELECT swh_content_fossology_license_add(%s)"
    self._cursor(cur).execute(query, (conflict_update, ))
def content_fossology_license_get_from_list(self, ids, cur=None): | def content_fossology_license_get_from_list(self, ids, cur=None): | ||||
"""Retrieve licenses per id. | """Retrieve licenses per id. | ||||
""" | """ | ||||
cur = self._cursor(cur) | cur = self._cursor(cur) | ||||
keys = map(self._convert_key, self.content_fossology_license_cols) | keys = map(self._convert_key, self.content_fossology_license_cols) | ||||
yield from execute_values_to_bytes( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
select %s | select %s | ||||
from (values %%s) as t(id) | from (values %%s) as t(id) | ||||
inner join content_fossology_license c on t.id=c.id | inner join content_fossology_license c on t.id=c.id | ||||
inner join indexer_configuration i | inner join indexer_configuration i | ||||
on i.id=c.indexer_configuration_id | on i.id=c.indexer_configuration_id | ||||
group by c.id, i.id, i.tool_name, i.tool_version, | group by c.id, i.id, i.tool_name, i.tool_version, | ||||
i.tool_configuration; | i.tool_configuration; | ||||
▲ Show 20 Lines • Show All 92 Lines • ▼ Show 20 Lines | def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, | ||||
"JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true " | "JOIN LATERAL (SELECT {tsquery_template}) AS s(tsq) ON true " | ||||
"WHERE to_tsvector('{regconfig}', metadata) @@ tsq " | "WHERE to_tsvector('{regconfig}', metadata) @@ tsq " | ||||
"ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC " | "ORDER BY ts_rank(oim.metadata_tsvector, tsq, 1) DESC " | ||||
"LIMIT %s;" | "LIMIT %s;" | ||||
).format(keys=', '.join(keys), | ).format(keys=', '.join(keys), | ||||
regconfig=regconfig, | regconfig=regconfig, | ||||
tsquery_template=tsquery_template) | tsquery_template=tsquery_template) | ||||
cur.execute(query, tsquery_args + [limit]) | cur.execute(query, tsquery_args + [limit]) | ||||
yield from cursor_to_bytes(cur) | yield from cur | ||||
# Column names selected by indexer_configuration_add_from_temp and
# indexer_configuration_get when reading tool configurations.
indexer_configuration_cols = [
    'id',
    'tool_name',
    'tool_version',
    'tool_configuration',
]
@stored_procedure('swh_mktemp_indexer_configuration')
def mktemp_indexer_configuration(self, cur=None):
    """Placeholder body; does nothing beyond the decorator.

    The ``stored_procedure`` decorator presumably runs the named SQL
    procedure (TODO confirm in swh.core.db.db_utils).

    """
def indexer_configuration_add_from_temp(self, cur=None):
    """Call ``swh_indexer_configuration_add()`` and yield its rows.

    The columns selected from the function's result are those listed in
    ``self.indexer_configuration_cols``; rows are yielded as the cursor
    produces them.

    """
    cur = self._cursor(cur)
    columns = ','.join(self.indexer_configuration_cols)
    cur.execute(
        "SELECT %s from swh_indexer_configuration_add()" % (columns, ))
    yield from cur
def indexer_configuration_get(self, tool_name,
                              tool_version, tool_configuration, cur=None):
    """Fetch one indexer_configuration row for the given tool.

    Args:
        tool_name: value matched against the ``tool_name`` column
        tool_version: value matched against the ``tool_version`` column
        tool_configuration: value matched against the
            ``tool_configuration`` column
        cur: optional cursor, resolved through ``self._cursor``

    Returns:
        The result of ``cur.fetchone()`` for the query, with the columns
        listed in ``self.indexer_configuration_cols`` (None when no row
        matches, per the DB-API ``fetchone`` contract).

    """
    cur = self._cursor(cur)
    columns = ','.join(self.indexer_configuration_cols)
    # The column list is interpolated here; the three %%s placeholders
    # survive as %s and are bound by the driver at execute time.
    query = '''select %s
               from indexer_configuration
               where tool_name=%%s and
                     tool_version=%%s and
                     tool_configuration=%%s''' % (columns, )
    cur.execute(query, (tool_name, tool_version, tool_configuration))
    return cur.fetchone()