diff --git a/README b/README index 44e401c..b841194 100644 --- a/README +++ b/README @@ -1,66 +1,56 @@ swh-git-cloner ============== Clone git repositories to infinity and beyond! -Configuration file for the producer (cloner.py): ~/.config/swh/cloner.ini +Configuration file for the producer (cloner.py): ~/.config/swh/clones-producer.ini [main] - # table where to look for repositories to clone - table_clones = sample - - # Number of rows to return - # Limit used to retrieve data from 'table_clones' - # default to None - # limit = 10 - - # the offset from which to start fetching data - # offset 5 so from the row number 5, fetch `limit` - # default to None - # offset = 5 - # logging directory - log_dir = swh-git-cloner/log/ + log_dir = /tmp/swh/cloner-git/log/ # url access to db db_url = host= port= dbname= user= password= # Repository scheme to build git, svn, etc, ... # ex: http://github.com/%s, https://github.com/%s, git://github.com/%s, etc... # %s represents the repository's full name. # For example, in github, user/repository repository_scheme = git://github.com/%s + # Do not fetch the clones but simply output them on standard output + #dry_run = true + Configuration file for the consumer worker-cloner: ~/.config/swh/worker-cloner.ini: [main] # mount folder where to dump the clones - mount=/tmp/swh-git-cloner + mount=/tmp/swh/cloner-git/ # witness file built after the cloning. 
It's a witness of a successful clone witness-file-name=witness # the ssh access key to the ssh-host ssh-access-command=ssh -i /some/path/.ssh/key-rsync -l user # ssh host to rsync to ssh-host= # destination folder on that host # ssh-host-destination-folder= # url access to db db_url = host= port= dbname= user= password= # max file size limit allowed to git clone, in bytes (default to 4G) clone_limit_size = 4294967296 # the queue url to access for consuming tasks # queue_url = amqp://guest:guest@localhost:5672// queue_url = amqp://guest:guest@192.168.100.31:5672// # soft time limit for a task, if exceeded, the worker cleans up # and stops task_soft_time_limit = 3600 diff --git a/bin/swh-clones-producer b/bin/swh-clones-producer index b332680..fb232ec 100755 --- a/bin/swh-clones-producer +++ b/bin/swh-clones-producer @@ -1,54 +1,55 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import logging import os from swh.core import config from swh.cloner.git.producer import clones # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/clones-producer.ini' # default configuration (can be overridden by the DEFAULT_CONF_FILE) DEFAULT_CONF = { - 'table_clones': ('string', 'sample'), - 'limit': ('int', None), - 'offset': ('int', None), 'log_dir': ('string', 'swh-git-cloner/log/'), 'db_url': ('string', 'dbname=github'), - 'repository_scheme': ('string', 'https://github.com/%s'), - 'debug': ('bool', None) + 'repository_scheme': ('string', 'git://github.com/%s'), + 'dry_run': ('bool', False) } def parse_args(): """Parse the configuration for the cli. 
""" cli = argparse.ArgumentParser( description='Clone git repository on fs.') cli.add_argument('--verbose', '-v', action='store_true', help='Verbosity level in log file.') + cli.add_argument('--dry-run', '-n', + action='store_true', + help='Dry run (print repo only)') cli.add_argument('--config', '-c', help='configuration file path') args = cli.parse_args() return args if __name__ == '__main__': - args = parse_args + args = parse_args() conf = config.read(args.config or DEFAULT_CONF_FILE, DEFAULT_CONF) + conf['dry_run'] = args.dry_run config.prepare_folders(conf, 'log_dir') logging.basicConfig(filename=os.path.join(conf['log_dir'], 'cloner.log'), level=logging.DEBUG if args.verbose else logging.INFO) clones.produce(conf) diff --git a/swh/cloner/git/producer/clones.py b/swh/cloner/git/producer/clones.py index bbf82e3..60ea743 100644 --- a/swh/cloner/git/producer/clones.py +++ b/swh/cloner/git/producer/clones.py @@ -1,31 +1,67 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.cloner.git.storage import db, models from swh.cloner.git.worker import tasks +def just_print_repos(repository_scheme, repos): + """Only print what should be posted in queue. + + Args: + repository_scheme: not used + repos: Generator of tuple repository id, repository name to print. + + Returns: + None + + Raises: + None + + """ + for (repo_id, repo_name) in repos: + print('load repo %s into queue' % repo_name) + + +def post_to_task_queue(repository_scheme, repos): + """Load repositories to queue. + + Args: + Generator of repository to print. 
+ + Returns: + None + + Raises: + None + + """ + for (repo_id, repo_name) in repos: + logging.info('load repo %s into queue' % repo_name) + repo_url = repository_scheme % repo_name + model_data = {'repo_id': repo_id, + 'repo_url': repo_url, + 'repo_name': repo_name} + tasks.orchestrate_clone_with_measure.delay(model_data) + + +_run_fn = { + True : just_print_repos +} + def produce(conf): """Make workers clone repositories. """ db_url = conf['db_url'] - table_clones = conf['table_clones'] - limit = conf['limit'] - offset = conf['offset'] + dry_run = conf['dry_run'] repository_scheme = conf['repository_scheme'] - with db.connect(db_url) as db_conn: - repos = models.load_random_sample(db_conn, table_clones, limit, offset) - - for (repo_id, repo_name) in repos: - logging.info('load repo %s into queue' % repo_name) - repo_url = repository_scheme % repo_name - model_data = {'repo_id': repo_id, - 'repo_url': repo_url, - 'repo_name': repo_name} + run_fn = _run_fn.get(dry_run, post_to_task_queue) - tasks.orchestrate_clone_with_measure.delay(model_data) + with db.connect(db_url) as db_conn: + repos = models.load_repos(db_conn) + run_fn(repository_scheme, repos)