Differential D7010 Diff 25518 swh/dataset/cli.py

Changeset View

Standalone View

View Options

swh/dataset/cli.py

Show First 20 Lines • Show All 175 Lines • ▼ Show 20 Lines

from swh.dataset.athena import run_query_get_results

print(

run_query_get_results(

database_name, query_file.read(), output_location=output_location,

end="",

) # CSV already ends with \n

@athena.command("gensubdataset")

@click.option("--database", "-d", default="swh", help="Name of the base database")

@click.option(

vlorentzUnsubmitted

Not Done

@click.option(

- "--subdataset-database", required=True, help="Name of the subdataset database"

+ "--subdataset-database", required=True, help="Name of the subdataset database to create"

)

@click.option(

vlorentz:

"--subdataset-database", required=True,

help="Name of the subdataset database to create"

)

@click.option(

"--subdataset-location",

required=True,

help="S3 prefix where the subdataset should be stored",

)

@click.option(

"--swhids",

required=True,

help="File containing the list of SWHIDs to include in the subdataset",

)

def athena_gensubdataset(database, subdataset_database, subdataset_location, swhids):

"""

Generate a subdataset with Athena, from an existing database and a list

vlorentzUnsubmitted

Not Done

Please define "subdataset" more precisely. Is it the transitive closure of the given SWHIDs, or just the subgraph induced by them?

vlorentz: Please define "subdataset" better. Is it the transitive closure of the given SWHIDs? or just…

seirlAuthorUnsubmitted

Done

It's literally "the SWHIDs to include in the subdataset". What's computed is the intersection between the base dataset and the SWHIDs contained in the file. I will try to expand the description further.

seirl: It's literally "the SWHIDs to include in the subdataset". What's computed is the intersection…

of SWHIDs. Athena will generate a new dataset with the same tables as in

the base dataset, but only containing the objects present in the SWHID

list.

"""

from swh.dataset.athena import generate_subdataset

generate_subdataset(

database,

subdataset_database,

subdataset_location,

swhids,

os.path.join(subdataset_location, "queries"),

)