diff --git a/README b/README index d9080ac..44e401c 100644 --- a/README +++ b/README @@ -1,65 +1,66 @@ swh-git-cloner ============== Clone git repositories to infinity and beyond! Configuration file for the producer (cloner.py): ~/.config/swh/cloner.ini [main] # table where to look for repositories to clone table_clones = sample # Number of rows to return # Limit used to retrieve data from 'table_clones' + # default to None # limit = 10 - # the nth page when limit is specified - # should not be populated if limit not set - # page 5 so from the rows (5*10)-(5+1)*10=50-60 - # page = 5 + # the offset from which to start fetching data + # e.g. offset = 5 starts fetching at row number 5, returning up to `limit` rows + # default to None + # offset = 5 # logging directory log_dir = swh-git-cloner/log/ # url access to db db_url = host= port= dbname= user= password= # Repository scheme to build git, svn, etc, ... # ex: http://github.com/%s, https://github.com/%s, git://github.com/%s, etc... # %s represents the fullname's repository. # For example, in github, / repository_scheme = git://github.com/%s Configuration file for the consumer worker-cloner: ~/.config/swh/worker-cloner.ini: [main] # mount folder where to dump the clones mount=/tmp/swh-git-cloner # witness file built after the cloning.
It's a witness of a success clone witness-file-name=witness # the ssh access key to the ssh-host ssh-access-command=ssh -i /some/path/.ssh/key-rsync -l user # ssh host to rsync to ssh-host= # destination folder on that host # ssh-host-destination-folder= # url access to db db_url = host= port= dbname= user= password= # max file size limit allowed to git clone, in bytes (default to 4G) clone_limit_size = 4294967296 # the queue url to access for consuming tasks # queue_url = amqp://guest:guest@localhost:5672// queue_url = amqp://guest:guest@192.168.100.31:5672// # soft time limit for a task, if exceeded, the worker cleans up # and stops task_soft_time_limit = 3600 diff --git a/bin/swh-git-producer b/bin/swh-git-producer index 49aef07..8d080a2 100755 --- a/bin/swh-git-producer +++ b/bin/swh-git-producer @@ -1,70 +1,70 @@ #!/usr/bin/env python3 # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import configparser import logging import os from swh.producer import clones # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/clones-producer.ini' # default configuration (can be overriden by the DEFAULT_CONF_FILE) DEFAULT_CONF = { 'table_clones': 'sample', 'limit': None, - 'page': None, + 'offset': None, 'log_dir': 'swh-git-cloner/log/', 'db_url': 'dbname=github', 'repository_scheme': 'https://github.com/%s', 'debug': None } def parse_args(): """Parse the configuration for the cli. """ cli = argparse.ArgumentParser( description='Clone git repository on fs.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args def read_conf(args): """Read the user's configuration file. args contains the repo to parse. 
Transmit to the result. """ config = configparser.ConfigParser(defaults=DEFAULT_CONF) conf_file = args.config or DEFAULT_CONF_FILE config.read(os.path.expanduser(conf_file)) conf = config._sections['main'] # ensure the default keys are set if some are missing for key in DEFAULT_CONF: conf[key] = conf.get(key, DEFAULT_CONF[key]) return conf if __name__ == '__main__': args = parse_args() conf = read_conf(args) log_filename = os.path.join(conf['log_dir'], 'cloner.log') logging.basicConfig(filename=log_filename, level=logging.DEBUG if args.verbose else logging.INFO) clones.produce(conf) diff --git a/swh/producer/clones.py b/swh/producer/clones.py index eb40d1a..9dddec6 100644 --- a/swh/producer/clones.py +++ b/swh/producer/clones.py @@ -1,32 +1,32 @@ # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.storage import db, models from swh.worker import tasks def produce(conf): """Make workers clone repositories. """ db_url = conf['db_url'] table_clones = conf['table_clones'] limit = conf['limit'] - page = conf['page'] + offset = conf['offset'] repository_scheme = conf['repository_scheme'] with db.connect(db_url) as db_conn: - repos = models.load_random_sample(db_conn, table_clones, limit, page) + repos = models.load_random_sample(db_conn, table_clones, limit, offset) for (repo_id, repo_name) in repos: logging.info('load repo %s into queue' % repo_name) repo_url = repository_scheme % repo_name model_data = {'repo_id': repo_id, 'repo_url': repo_url, 'repo_name': repo_name} tasks.orchestrate_clone_with_measure.delay(model_data) diff --git a/swh/storage/models.py b/swh/storage/models.py index b9e35ac..b8314fd 100644 --- a/swh/storage/models.py +++ b/swh/storage/models.py @@ -1,63 +1,63 @@ # Copyright (C) 2015 Stefano Zacchiroli , # Antoine R. 
Dumont # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.storage import db def load_repos(db_conn, limit=10): """List the repository. Limit the number of repository to load. 10 is the default if no limit is provided. NOT USED YET?! """ return db.query_fetch(db_conn, ("""SELECT id, full_name FROM orig_repos LIMIT %s""", (limit,))) def load_random_1_percent_repos(db_conn): """Load the 1 percent repositories at random. NOT USED YET?! """ return db.query_fetch(db_conn, """SELECT id, full_name FROM repos_random_sample(1) UNION ALL SELECT id, full_name, html_url FROM repos_well_known();""") -def load_random_sample(db_conn, table='sample', limit=None, page=None): +def load_random_sample(db_conn, table='sample', limit=None, offset=None): """Load the table sample containing random sample of repositories to fetch. """ if limit: query_limit = ' order by id limit ' + limit - query_limit += '' if not page else ' offset ' + str(int(page) * int(limit)) + query_limit += '' if not offset else ' offset ' + str(int(offset)) else: query_limit = '' query = 'SELECT id, full_name FROM ' + table + query_limit return db.query_fetch(db_conn, query) def persist_task_result(db_conn, repo_id, task_id, task_start_date, task_duration, status, json_result, stdout, stderr): """Persist the task's result. """ return db.query_execute(db_conn, ("""INSERT INTO crawl_history (repo, task_id, date, duration, status, result, stdout, stderr) VALUES(%s, %s, %s, %s, %s, %s, %s, %s) """, (repo_id, task_id, task_start_date, task_duration, status, json_result, stdout, stderr)))