-- DONE CREATE TABLE ctas_dataset_dir_to_rev WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/dir_to_rev/', field_delimiter = ' ') AS SELECT to_hex(directory.id) as source, to_hex(target) as dest FROM directory CROSS JOIN UNNEST(rev_entries) as t(rev_id) INNER JOIN directory_entry_rev ON directory_entry_rev.id = rev_id; -- TODO: error: -- Query exhausted resources at this scale factor. CREATE TABLE ctas_dataset_dir_to_file WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/dir_to_file/', field_delimiter = ' ') AS SELECT to_hex(directory.id) as source, to_hex(target) as dest FROM directory CROSS JOIN UNNEST(file_entries) as t(file_id) INNER JOIN directory_entry_file ON directory_entry_file.id = file_id; -- TODO: error: -- Query exhausted resources at this scale factor. CREATE TABLE ctas_dataset_dir_to_dir WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/dir_to_dir/', field_delimiter = ' ') AS SELECT to_hex(directory.id) as source, to_hex(target) as dest FROM directory CROSS JOIN UNNEST(dir_entries) as t(dir_id) INNER JOIN directory_entry_dir ON directory_entry_dir.id = dir_id; -- DONE CREATE TABLE ctas_dataset_rev_to_dir WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/rev_to_dir/', field_delimiter = ' ') AS SELECT to_hex(id) as source, to_hex(directory) as dest FROM revision; -- DONE CREATE TABLE ctas_dataset_revision WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/revision/', field_delimiter = ' ') AS SELECT to_hex(parent_id) as source, to_hex(id) as dest FROM revision_history; -- DONE CREATE TABLE ctas_dataset_release WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/release/', field_delimiter = ' ') AS SELECT to_hex(id) as source, to_hex(target) as dest FROM release; -- DONE CREATE TABLE ctas_dataset_snapshot WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/snapshot/', field_delimiter = ' ') AS SELECT to_hex(snapshot.id) as source, to_hex(snapshot_branch.target) as dest FROM snapshot_branch INNER JOIN snapshot_branches on snapshot_branches.branch_id = snapshot_branch.object_id INNER JOIN snapshot on snapshot_branches.snapshot_id = snapshot.object_id; -- TODO: error: -- HIVE_BAD_DATA: Field snapshot_id's type DOUBLE in parquet is incompatible -- with type bigint defined in table schema. CREATE TABLE ctas_dataset_origin WITH (format = 'TEXTFILE', external_location = 's3://softwareheritage/edges_dataset/origin/', field_delimiter = ' ') AS SELECT origin as source, to_hex(snapshot.id) as dest FROM origin_visit INNER JOIN snapshot on snapshot_id = snapshot.object_id;