@click.group(name='dataset', context_settings=CONTEXT_SETTINGS)
@click.option('--config-file', '-C', default=None,
              type=click.Path(exists=True, dir_okay=False),
              help="Configuration file.")
@click.pass_context
def cli(ctx, config_file):
    """Software Heritage Dataset Tools"""
    # NOTE: the docstring above is rendered by click as the group's help
    # text, so it is kept short and user-facing.
    ctx.ensure_object(dict)

    # Make the parsed configuration available to every sub-command through
    # the click context object.
    ctx.obj['config'] = config.read(config_file)


@cli.command('export-graph')
@click.argument('export-path', type=click.File('w'))
@click.option('--export-id', '-e', help="Unique ID of the export run.")
@click.pass_context
def export_graph(ctx, export_path, export_id):
    """Export graph edges read from the journal to EXPORT_PATH.

    Edges are written one per line to EXPORT_PATH. If no export ID is
    given, a random one is generated. The command runs until interrupted
    (Ctrl-C) and reports its progress on the standard error stream.
    """
    if not export_id:
        export_id = str(uuid.uuid4())

    # The journal group id is derived from the export id, so two runs that
    # share an export id also share a group id.
    client = get_journal_client(
        ctx,
        object_types=[
            'origin_visit',
            'snapshot',
            'release',
            'revision',
            'directory',
        ],
        group_id=('swh-dataset-export-' + export_id)
    )
    # `export_path` is an already opened, writable file object (click.File),
    # not a path string.
    process_messages = functools.partial(graph.process_messages,
                                         output=export_path)
    nb_messages = 0
    try:
        while True:
            # presumably `client.process` returns the number of messages it
            # handled -- confirm against swh.journal
            nb_messages += client.process(process_messages)
            # Progress goes to stderr: EXPORT_PATH may be '-' (stdout), and
            # mixing progress lines into the edge list would corrupt it.
            click.echo('Processed {} messages.'.format(nb_messages),
                       err=True)
    except KeyboardInterrupt:
        # Interrupting is the expected way to stop the export; exit cleanly.
        ctx.exit(0)
def process_messages(messages, output):
    """Write the graph edges found in a batch of messages to ``output``.

    Each edge is printed as one line of the form ``<src> <dst>``, where both
    identifiers have been passed through ``hash_to_hex``.

    Args:
        messages: mapping from an object type (``'origin_visit'``,
            ``'snapshot'``, ``'release'``, ``'revision'``, ``'directory'``)
            to an iterable of objects of that type; missing types are
            treated as empty.
        output: writable text file object receiving the edges.

    Edges with a missing endpoint are skipped. Snapshot branches that are
    dangling (``None``), that alias a branch absent from the snapshot, or
    that are part of an alias cycle produce no edge.
    """
    def write(src, dst):
        # An edge needs both endpoints (e.g. a visit may have no snapshot).
        if src is None or dst is None:
            return
        print(hash_to_hex(src), hash_to_hex(dst), file=output)

    for visit in messages.get('origin_visit', []):
        write(origin_identifier({'url': visit['origin']}), visit['snapshot'])

    for snapshot in messages.get('snapshot', []):
        branches = snapshot['branches']
        for branch in branches.values():
            # Follow alias chains down to the branch they finally designate.
            # `visited` guards against alias cycles, which would otherwise
            # loop forever; `.get` tolerates aliases to unknown branches.
            visited = set()
            while branch is not None and branch['target_type'] == 'alias':
                alias_target = branch['target']
                if alias_target in visited:
                    branch = None
                else:
                    visited.add(alias_target)
                    branch = branches.get(alias_target)
            if branch is None:
                # Dangling branch or unresolvable alias: nothing to link to.
                continue
            write(snapshot['id'], branch['target'])

    for release in messages.get('release', []):
        write(release['id'], release['target'])

    for revision in messages.get('revision', []):
        for parent in revision['parents']:
            write(revision['id'], parent)

    for directory in messages.get('directory', []):
        for entry in directory['entries']:
            write(directory['id'], entry['target'])