# Use sample:
# cat mercurial.output.txt | \
#     grep 'sensible-filter' | \
#     python3 -m from-output-to-csv

# The output of this command is designed to feed the
# `swh.scheduler.cli task schedule` subcommand

import ast
import json
import sys

import click

# Maps the user-facing --task-type choice to the scheduler task type
# actually written in the CSV output.
TYPES = {
    'mercurial': 'origin-update-hg',
    'mercurial-archive': 'origin-load-archive-hg',
    'svn': 'origin-update-svn',
    'svn-archive': 'swh-loader-mount-dump-and-load-svn-repository',
    'pypi': 'origin-update-pypi',
}


@click.command()
@click.option('--task-policy', default='oneshot',
              type=click.Choice(['oneshot', 'recurring']),
              help="Task's policy")
@click.option('--task-type', default='mercurial',
              type=click.Choice(TYPES.keys()),
              help="Task's type")
def main(task_policy, task_type):
    """Given an output from kibana_fetch_logs, transform the input in
    csv format (; as delimiter) to ease back scheduling for those
    origins.

    Each stdin line is expected to be a Python literal (parsed with
    ast.literal_eval) holding at least the keys 'args' and 'kwargs'.

    The output format is of the form:

        task-type;task-policy;task-args;task-kwargs

    Then use the `swh.scheduler.cli task schedule` subcommand:

        cat output.csv | \\
            python3 -m swh.scheduler.cli task schedule \\
                --columns type \\
                --columns policy \\
                --columns args \\
                --columns kwargs \\
                --delimiter ';' -

    """
    for line in sys.stdin:
        line = line.rstrip()
        data = ast.literal_eval(line)
        _task_type = TYPES.get(task_type)
        _task_args = json.dumps(data['args'])
        kwargs = data['kwargs']
        # HACK: visit_date should have been set earlier...
        if task_type in ['mercurial', 'mercurial-archive']:
            if 'visit_date' not in kwargs:
                kwargs['visit_date'] = 'Tue, 3 May 2016 17:16:32 +0200'
        elif task_type == 'pypi':
            # HACK: Need an adapter as long as T1246#23691 is not
            # solved, we will need this (task and loader are now
            # aligned but the indexed data is there)
            if 'origin_metadata_url' in kwargs:
                url = kwargs.pop('origin_metadata_url')
                kwargs['project_metadata_url'] = url
        _task_kwargs = json.dumps(kwargs)
        print(';'.join([_task_type, task_policy, _task_args, _task_kwargs]))


if __name__ == '__main__':
    main()