Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/cli.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
# WARNING: do not import unnecessary things here to keep cli startup time under | # WARNING: do not import unnecessary things here to keep cli startup time under | ||||
# control | # control | ||||
import os | import os | ||||
import pathlib | import pathlib | ||||
import sys | |||||
import click | import click | ||||
from swh.core.cli import CONTEXT_SETTINGS | from swh.core.cli import CONTEXT_SETTINGS | ||||
from swh.core.cli import swh as swh_cli_group | from swh.core.cli import swh as swh_cli_group | ||||
from swh.dataset.exporters.edges import GraphEdgesExporter | from swh.dataset.exporters.edges import GraphEdgesExporter | ||||
from swh.dataset.exporters.orc import ORCExporter | from swh.dataset.exporters.orc import ORCExporter | ||||
from swh.dataset.journalprocessor import ParallelJournalProcessor | from swh.dataset.journalprocessor import ParallelJournalProcessor | ||||
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines | |||||
@graph.command("sort")
@click.argument("export-path", type=click.Path())
@click.pass_context
def sort_graph(ctx, export_path):
    # The configuration is read from the click context object under "config".
    export_config = ctx.obj["config"]

    # Imported inside the command rather than at module level, in line with
    # the file's warning about keeping CLI startup time under control.
    from swh.dataset.exporters.edges import sort_graph_nodes

    sort_graph_nodes(export_path, export_config)
# Parent group for the "athena" subcommands. It performs no work of its own;
# the docstring below is what click displays as the group's help text.
@dataset_cli_group.group("athena")
@click.pass_context
def athena(ctx):
    """Manage and query a remote AWS Athena database"""
@athena.command("create")
@click.option(
    "--database-name",
    "-d",
    default="swh",
    help="Name of the database to create",
)
@click.option(
    "--location-prefix",
    "-l",
    required=True,
    help="S3 prefix where the dataset can be found",
)
@click.option(
    "-o",
    "--output-location",
    help="S3 prefix where results should be stored",
)
@click.option(
    "-r",
    "--replace-tables",
    is_flag=True,
    help="Replace the tables that already exist",
)
def athena_create(
    database_name, location_prefix, output_location=None, replace_tables=False
):
    """Create tables on AWS Athena pointing to a given graph dataset on S3."""
    # Deferred import: only pay for loading the Athena helpers when this
    # command is actually invoked.
    from swh.dataset.athena import create_tables

    # The CLI flag is called --replace-tables, but the helper expects it
    # under the keyword "replace".
    table_options = {
        "output_location": output_location,
        "replace": replace_tables,
    }
    create_tables(database_name, location_prefix, **table_options)
@athena.command("query")
@click.option(
    "--database-name", "-d", default="swh", help="Name of the database to query"
)
@click.option(
    "-o", "--output-location", help="S3 prefix where results should be stored"
)
# The query is read from a file argument, mimicking how most database command
# lines work. When the argument is omitted it defaults to "-", which
# click.File resolves to standard input, so the query can also be piped in or
# given through a here-string/heredoc. The "-" sentinel is used instead of the
# sys.stdin object so that the stream is looked up when the command runs, not
# frozen at module import time (which would ignore a later-replaced sys.stdin,
# e.g. under click's test runner).
@click.argument("query_file", type=click.File("r"), default="-")
def athena_query(
    database_name, query_file, output_location=None,
):
    """Query the AWS Athena database with a given command"""
    # Deferred import: only load the Athena helpers when this command runs.
    from swh.dataset.athena import run_query_get_results

    # Read the whole query text from the file (or stdin) and run it.
    results = run_query_get_results(
        database_name, query_file.read(), output_location=output_location,
    )
    print(results, end="")  # CSV already ends with \n
maybe "manage and query"