Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/db.py
# Copyright (C) 2015-2019 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import random | |||||
import select | import select | ||||
from swh.core.db import BaseDb | from swh.core.db import BaseDb | ||||
from swh.core.db.db_utils import stored_procedure, jsonize | from swh.core.db.db_utils import stored_procedure, jsonize | ||||
from swh.core.db.db_utils import execute_values_generator | from swh.core.db.db_utils import execute_values_generator | ||||
from swh.model.model import SHA1_SIZE | |||||
class Db(BaseDb): | class Db(BaseDb): | ||||
"""Proxy to the SWH DB, with wrappers around stored procedures | """Proxy to the SWH DB, with wrappers around stored procedures | ||||
""" | """ | ||||
def mktemp_dir_entry(self, entry_type, cur=None): | def mktemp_dir_entry(self, entry_type, cur=None): | ||||
▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines | def snapshot_get_by_origin_visit(self, origin_url, visit_id, cur=None): | ||||
WHERE origin.url=%s AND origin_visit.visit=%s; | WHERE origin.url=%s AND origin_visit.visit=%s; | ||||
""" | """ | ||||
cur.execute(query, (origin_url, visit_id)) | cur.execute(query, (origin_url, visit_id)) | ||||
ret = cur.fetchone() | ret = cur.fetchone() | ||||
if ret: | if ret: | ||||
return ret[0] | return ret[0] | ||||
def snapshot_get_random(self, cur=None): | |||||
return self._get_random_row_from_table( | |||||
'snapshot', ['id'], 'id', cur) | |||||
content_find_cols = ['sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', | content_find_cols = ['sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', | ||||
'ctime', 'status'] | 'ctime', 'status'] | ||||
def content_find(self, sha1=None, sha1_git=None, sha256=None, | def content_find(self, sha1=None, sha1_git=None, sha256=None, | ||||
blake2s256=None, cur=None): | blake2s256=None, cur=None): | ||||
"""Find the content optionally on a combination of the following | """Find the content optionally on a combination of the following | ||||
checksums sha1, sha1_git, sha256 or blake2s256. | checksums sha1, sha1_git, sha256 or blake2s256. | ||||
Show All 22 Lines | def content_find(self, sha1=None, sha1_git=None, sha256=None, | ||||
cur.execute("""SELECT %s | cur.execute("""SELECT %s | ||||
FROM content WHERE %s | FROM content WHERE %s | ||||
""" | """ | ||||
% (','.join(self.content_find_cols), query), | % (','.join(self.content_find_cols), query), | ||||
args) | args) | ||||
content = cur.fetchall() | content = cur.fetchall() | ||||
return content | return content | ||||
def content_get_random(self, cur=None): | |||||
return self._get_random_row_from_table( | |||||
'content', ['sha1_git'], 'sha1_git', cur) | |||||
def directory_missing_from_list(self, directories, cur=None): | def directory_missing_from_list(self, directories, cur=None): | ||||
cur = self._cursor(cur) | cur = self._cursor(cur) | ||||
yield from execute_values_generator( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
SELECT id FROM (VALUES %s) as t(id) | SELECT id FROM (VALUES %s) as t(id) | ||||
WHERE NOT EXISTS ( | WHERE NOT EXISTS ( | ||||
SELECT 1 FROM directory d WHERE d.id = t.id | SELECT 1 FROM directory d WHERE d.id = t.id | ||||
) | ) | ||||
Show All 27 Lines | def directory_entry_get_by_path(self, directory, paths, cur=None): | ||||
'SELECT %s FROM swh_find_directory_entry_by_path(%%s, %%s)' % cols) | 'SELECT %s FROM swh_find_directory_entry_by_path(%%s, %%s)' % cols) | ||||
cur.execute(query, (directory, paths)) | cur.execute(query, (directory, paths)) | ||||
data = cur.fetchone() | data = cur.fetchone() | ||||
if set(data) == {None}: | if set(data) == {None}: | ||||
return None | return None | ||||
return data | return data | ||||
def directory_get_random(self, cur=None): | |||||
return self._get_random_row_from_table( | |||||
'directory', ['id'], 'id', cur) | |||||
def revision_missing_from_list(self, revisions, cur=None): | def revision_missing_from_list(self, revisions, cur=None): | ||||
cur = self._cursor(cur) | cur = self._cursor(cur) | ||||
yield from execute_values_generator( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
SELECT id FROM (VALUES %s) as t(id) | SELECT id FROM (VALUES %s) as t(id) | ||||
WHERE NOT EXISTS ( | WHERE NOT EXISTS ( | ||||
SELECT 1 FROM revision r WHERE r.id = t.id | SELECT 1 FROM revision r WHERE r.id = t.id | ||||
▲ Show 20 Lines • Show All 281 Lines • ▼ Show 20 Lines | def revision_shortlog(self, root_revisions, limit=None, cur=None): | ||||
query = """SELECT %s | query = """SELECT %s | ||||
FROM swh_revision_list(%%s, %%s) | FROM swh_revision_list(%%s, %%s) | ||||
""" % ', '.join(self.revision_shortlog_cols) | """ % ', '.join(self.revision_shortlog_cols) | ||||
cur.execute(query, (root_revisions, limit)) | cur.execute(query, (root_revisions, limit)) | ||||
yield from cur | yield from cur | ||||
def revision_get_random(self, cur=None): | |||||
return self._get_random_row_from_table( | |||||
'revision', ['id'], 'id', cur) | |||||
def release_missing_from_list(self, releases, cur=None): | def release_missing_from_list(self, releases, cur=None): | ||||
cur = self._cursor(cur) | cur = self._cursor(cur) | ||||
yield from execute_values_generator( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
SELECT id FROM (VALUES %s) as t(id) | SELECT id FROM (VALUES %s) as t(id) | ||||
WHERE NOT EXISTS ( | WHERE NOT EXISTS ( | ||||
SELECT 1 FROM release r WHERE r.id = t.id | SELECT 1 FROM release r WHERE r.id = t.id | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 212 Lines • ▼ Show 20 Lines | def release_get_from_list(self, releases, cur=None): | ||||
yield from execute_values_generator( | yield from execute_values_generator( | ||||
cur, """ | cur, """ | ||||
SELECT %s FROM (VALUES %%s) as t(id) | SELECT %s FROM (VALUES %%s) as t(id) | ||||
LEFT JOIN release ON t.id = release.id | LEFT JOIN release ON t.id = release.id | ||||
LEFT JOIN person author ON release.author = author.id | LEFT JOIN person author ON release.author = author.id | ||||
""" % query_keys, | """ % query_keys, | ||||
((id,) for id in releases)) | ((id,) for id in releases)) | ||||
def release_get_random(self, cur=None): | |||||
return self._get_random_row_from_table( | |||||
'release', ['id'], 'id', cur) | |||||
def origin_metadata_add(self, origin, ts, provider, tool, | def origin_metadata_add(self, origin, ts, provider, tool, | ||||
metadata, cur=None): | metadata, cur=None): | ||||
""" Add an origin_metadata for the origin at ts with provider, tool and | """ Add an origin_metadata for the origin at ts with provider, tool and | ||||
metadata. | metadata. | ||||
Args: | Args: | ||||
origin (int): the origin's id for which the metadata is added | origin (int): the origin's id for which the metadata is added | ||||
ts (datetime): time when the metadata was found | ts (datetime): time when the metadata was found | ||||
▲ Show 20 Lines • Show All 94 Lines • ▼ Show 20 Lines | def metadata_provider_get_by(self, provider_name, provider_url, | ||||
cur.execute('''select %s | cur.execute('''select %s | ||||
from metadata_provider | from metadata_provider | ||||
where provider_name=%%s and | where provider_name=%%s and | ||||
provider_url=%%s''' % ( | provider_url=%%s''' % ( | ||||
','.join(self.metadata_provider_cols)), | ','.join(self.metadata_provider_cols)), | ||||
(provider_name, provider_url)) | (provider_name, provider_url)) | ||||
return cur.fetchone() | return cur.fetchone() | ||||
def _get_random_row_from_table(self, table_name, cols, id_col, cur=None): | |||||
random_sha1 = bytes(random.randint(0, 255) for _ in range(SHA1_SIZE)) | |||||
cur = self._cursor(cur) | |||||
query = ''' | |||||
(SELECT {cols} FROM {table} WHERE {id_col} >= %s | |||||
ORDER BY {id_col} LIMIT 1) | |||||
UNION | |||||
(SELECT {cols} FROM {table} WHERE {id_col} < %s | |||||
ORDER BY {id_col} DESC LIMIT 1) | |||||
LIMIT 1 | |||||
'''.format(cols=', '.join(cols), table=table_name, id_col=id_col) | |||||
cur.execute(query, (random_sha1, random_sha1)) | |||||
row = cur.fetchone() | |||||
if row: | |||||
return row[0] |