Page MenuHomeSoftware Heritage
Paste P345

(An Untitled Masterwork)
ActivePublic

Authored by seirl on Jan 7 2019, 7:15 PM.
-- DONE
CREATE TABLE ctas_dataset_dir_to_rev
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/dir_to_rev/', field_delimiter = ' ')
AS SELECT to_hex(directory.id) as source, to_hex(target) as dest
FROM directory
CROSS JOIN UNNEST(rev_entries) as t(rev_id)
INNER JOIN directory_entry_rev ON directory_entry_rev.id = rev_id;
-- TODO: error:
-- Query exhausted resources at this scale factor.
CREATE TABLE ctas_dataset_dir_to_file
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/dir_to_file/', field_delimiter = ' ')
AS SELECT to_hex(directory.id) as source, to_hex(target) as dest
FROM directory
CROSS JOIN UNNEST(file_entries) as t(file_id)
INNER JOIN directory_entry_file ON directory_entry_file.id = file_id;
-- TODO: error:
-- Query exhausted resources at this scale factor.
CREATE TABLE ctas_dataset_dir_to_dir
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/dir_to_dir/', field_delimiter = ' ')
AS SELECT to_hex(directory.id) as source, to_hex(target) as dest
FROM directory
CROSS JOIN UNNEST(dir_entries) as t(dir_id)
INNER JOIN directory_entry_dir ON directory_entry_dir.id = dir_id;
-- DONE
CREATE TABLE ctas_dataset_rev_to_dir
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/rev_to_dir/', field_delimiter = ' ')
AS SELECT to_hex(id) as source, to_hex(directory) as dest
FROM revision;
-- DONE
CREATE TABLE ctas_dataset_revision
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/revision/', field_delimiter = ' ')
AS SELECT to_hex(parent_id) as source, to_hex(id) as dest
FROM revision_history;
-- DONE
CREATE TABLE ctas_dataset_release
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/release/', field_delimiter = ' ')
AS SELECT to_hex(id) as source, to_hex(target) as dest
FROM release;
-- DONE
CREATE TABLE ctas_dataset_snapshot
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/snapshot/', field_delimiter = ' ')
AS SELECT
to_hex(snapshot.id) as source,
to_hex(snapshot_branch.target) as dest
FROM snapshot_branch
INNER JOIN snapshot_branches on snapshot_branches.branch_id = snapshot_branch.object_id
INNER JOIN snapshot on snapshot_branches.snapshot_id = snapshot.object_id;
-- TODO: error:
-- HIVE_BAD_DATA: Field snapshot_id's type DOUBLE in parquet is incompatible
-- with type bigint defined in table schema.
CREATE TABLE ctas_dataset_origin
WITH (format = 'TEXTFILE', external_location =
's3://softwareheritage/edges_dataset/origin/', field_delimiter = ' ')
AS SELECT
origin as source,
to_hex(snapshot.id) as dest
FROM origin_visit
INNER JOIN snapshot on snapshot_id = snapshot.object_id;

Event Timeline