Changeset View
Standalone View
swh/dataset/athena.py
Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines | def create_tables(database_name, dataset_location, output_location=None, replace=False): | ||||
Create the Software Heritage Dataset tables on AWS Athena. | Create the Software Heritage Dataset tables on AWS Athena. | ||||
Athena works on external columnar data stored in S3, but requires a schema | Athena works on external columnar data stored in S3, but requires a schema | ||||
for each table to run queries. This creates all the necessary tables | for each table to run queries. This creates all the necessary tables | ||||
remotely by using the relational schemas in swh.dataset.relational. | remotely by using the relational schemas in swh.dataset.relational. | ||||
""" | """ | ||||
client = boto3.client("athena") | client = boto3.client("athena") | ||||
client.output_location = output_location | client.output_location = output_location | ||||
client.database_name = database_name | |||||
client.database_name = "default" # we have to pick some existing database | |||||
query( | query( | ||||
client, | client, | ||||
create_database(database_name), | create_database(database_name), | ||||
desc="Creating {} database".format(database_name), | desc="Creating {} database".format(database_name), | ||||
) | ) | ||||
client.database_name = database_name | |||||
if replace: | if replace: | ||||
for table in TABLES: | for table in TABLES: | ||||
query( | query( | ||||
client, | client, | ||||
drop_table(database_name, table), | drop_table(database_name, table), | ||||
desc="Dropping table {}".format(table), | desc="Dropping table {}".format(table), | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 158 Lines • Show Last 20 Lines |