Changeset View
Changeset View
Standalone View
Standalone View
swh/dataset/cli.py
Show First 20 Lines • Show All 175 Lines • ▼ Show 20 Lines | ): | |||||||||
from swh.dataset.athena import run_query_get_results | from swh.dataset.athena import run_query_get_results | |||||||||
print( | print( | |||||||||
run_query_get_results( | run_query_get_results( | |||||||||
database_name, query_file.read(), output_location=output_location, | database_name, query_file.read(), output_location=output_location, | |||||||||
), | ), | |||||||||
end="", | end="", | |||||||||
) # CSV already ends with \n | ) # CSV already ends with \n | |||||||||
@athena.command("gensubdataset") | ||||||||||
@click.option("--database", "-d", default="swh", help="Name of the base database") | ||||||||||
@click.option( | ||||||||||
[Inline comment — Unsubmitted, Not Done] vlorentz: | ||||||||||
"--subdataset-database", required=True, | ||||||||||
help="Name of the subdataset database to create" | ||||||||||
) | ||||||||||
@click.option( | ||||||||||
"--subdataset-location", | ||||||||||
required=True, | ||||||||||
help="S3 prefix where the subdataset should be stored", | ||||||||||
) | ||||||||||
@click.option( | ||||||||||
"--swhids", | ||||||||||
required=True, | ||||||||||
help="File containing the list of SWHIDs to include in the subdataset", | ||||||||||
) | ||||||||||
def athena_gensubdataset(database, subdataset_database, subdataset_location, swhids): | ||||||||||
""" | ||||||||||
Generate a subdataset with Athena, from an existing database and a list | ||||||||||
[Inline comment — Not Done] vlorentz: Please define "subdataset" better. Is it the transitive closure of the given SWHIDs, or just the subgraph induced by them? | ||||||||||
[Inline comment — Done] seirl: It's literally "the SWHIDs to include in the subdataset". What's computed is the intersection between the base dataset and the swhids contained in the file. I will try to expand the description more. | ||||||||||
of SWHIDs. Athena will generate a new dataset with the same tables as in | ||||||||||
the base dataset, but only containing the objects present in the SWHID | ||||||||||
list. | ||||||||||
""" | ||||||||||
from swh.dataset.athena import generate_subdataset | ||||||||||
generate_subdataset( | ||||||||||
database, | ||||||||||
subdataset_database, | ||||||||||
subdataset_location, | ||||||||||
swhids, | ||||||||||
os.path.join(subdataset_location, "queries"), | ||||||||||
) |