diff --git a/swh/cloner/git/storage/db.py b/swh/cloner/git/storage/db.py index c933d55..378090c 100644 --- a/swh/cloner/git/storage/db.py +++ b/swh/cloner/git/storage/db.py @@ -1,77 +1,63 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import psycopg2 def connect(db_url): """Open db connection. """ return psycopg2.connect(db_url) -def execute(cur, query_params, trace=None): +def entry_to_bytes(entry): + """Convert an entry coming from the database to bytes""" + if isinstance(entry, memoryview): + return entry.tobytes() + return entry + + +def line_to_bytes(line): + """Convert a line coming from the database to bytes""" + return line.__class__(entry_to_bytes(entry) for entry in line) + + +def cursor_to_bytes(cursor): + """Yield all the data from a cursor as bytes""" + yield from (line_to_bytes(line) for line in cursor) + + +def execute(cur, query_params): """Execute the query_params. query_params is expected to be either: - a sql query (string) - a tuple (sql query, params) """ if isinstance(query_params, str): cur.execute(query_params) else: - if trace is not None: - print("mogrify: ", cur.mogrify(*query_params).decode()) cur.execute(*query_params) -def copy_from(cur, file, table): - """Copy the content of a file to the db in the table table. - """ - cur.copy_from(file, table) - - def query_execute(db_conn, query_params): """Execute one query. Type of sql queries: insert, delete, drop, create... query_params is expected to be either: - a sql query (string) - a tuple (sql query, params) """ with db_conn.cursor() as cur: execute(cur, query_params) -def queries_execute(db_conn, queries_params, trace=None): - """Execute multiple queries without any result expected. - Type of sql queries: insert, delete, drop, create... - query_params is expected to be a list of mixed: - - sql query (string) - - tuple (sql query, params) - """ - with db_conn.cursor() as cur: - for query_params in queries_params: - execute(cur, query_params, trace) - - -def query_fetchone(db_conn, query_params): - """Execute sql query which returns one result. - query_params is expected to be either: - - a sql query (string) - - a tuple (sql query, params) - """ - with db_conn.cursor() as cur: - execute(cur, query_params) - return cur.fetchone() - - -def query_fetch(db_conn, query_params, trace=None): +def query_fetch(db_conn, query_params): """Execute sql query which returns results. query_params is expected to be either: - a sql query (string) - a tuple (sql query, params) """ with db_conn.cursor() as cur: - execute(cur, query_params, trace) - return cur.fetchall() + execute(cur, query_params) + yield from cursor_to_bytes(cur) diff --git a/swh/cloner/git/storage/models.py b/swh/cloner/git/storage/models.py index e30cffa..128df53 100644 --- a/swh/cloner/git/storage/models.py +++ b/swh/cloner/git/storage/models.py @@ -1,62 +1,33 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.cloner.git.storage import db -def load_repos(db_conn, limit=10): +def load_repos(db_conn): """List the repository. Limit the number of repository to load. - 10 is the default if no limit is provided. - NOT USED YET?! """ - return db.query_fetch(db_conn, ("""SELECT id, full_name - FROM orig_repos - LIMIT %s""", - (limit,))) - - -def load_random_1_percent_repos(db_conn): - """Load the 1 percent repositories at random. - NOT USED YET?! - """ - return db.query_fetch(db_conn, """SELECT id, full_name - FROM repos_random_sample(1) - UNION ALL - SELECT id, full_name, html_url - FROM repos_well_known();""") - - -def load_random_sample(db_conn, table='sample', limit=None, offset=None): - """Load the table sample containing random sample of repositories to fetch. - """ - if limit: - query_limit = ' order by id limit ' + limit - query_limit += '' if not offset else ' offset ' + offset - else: - query_limit = '' - - query = 'SELECT id, full_name FROM ' + table + query_limit - return db.query_fetch(db_conn, query) + yield from db.query_fetch(db_conn, 'select id, full_name from missing_orig_repos order by id') def persist_task_result(db_conn, repo_id, task_id, task_start_date, task_duration, status, json_result, stdout, stderr): """Persist the task's result. """ return db.query_execute(db_conn, ("""INSERT INTO crawl_history (repo, task_id, date, duration, status, result, stdout, stderr) VALUES(%s, %s, %s, %s, %s, %s, %s, %s) """, (repo_id, task_id, task_start_date, task_duration, status, json_result, stdout, stderr)))