Page MenuHomeSoftware Heritage
Paste P1529

import the license dataset into sqlite
ActivePublic

Authored by zack on Nov 14 2022, 4:47 PM.
-- Import the Software Heritage License Dataset into a SQLite database
-- Sample usage: "sqlite3 licenses.sqlite '.read import-dataset.sql'"
-- Related: https://forge.softwareheritage.org/T4683
DROP TABLE IF EXISTS "license_blobs" ;
CREATE TABLE "license_blobs" ("swhid" TEXT, "sha1" TEXT, "name" TEXT);
.import --csv --skip 1 "| zstdcat license-blobs.csv.zst" "license_blobs"
CREATE INDEX idx_license_blobs_sha1 ON license_blobs (sha1);
CREATE INDEX idx_license_blobs_swhid ON license_blobs (swhid);
DROP TABLE IF EXISTS "blobs_earliest";
CREATE TABLE "blobs_earliest" ("swhid" TEXT, "commit" TEXT, "timestamp" INTEGER, "occurrences" INTEGER);
.mode tabs
.import --skip 1 "| zstdcat blobs-earliest.csv.zst" "blobs_earliest"
.mode list
CREATE INDEX idx_blobs_earliest_swhid ON blobs_earliest (swhid);
DROP TABLE IF EXISTS "blobs_fileinfo";
CREATE TABLE "blobs_fileinfo" ("sha1" TEXT, "mime_type" TEXT, "encoding" TEXT, "line_count" INTEGER, "word_count" INTEGER, "size" INTEGER);
.import --csv --skip 1 "| zstdcat blobs-fileinfo.csv.zst" "blobs_fileinfo"
CREATE INDEX idx_blobs_fileinfo_sha1 ON blobs_fileinfo (sha1);
DROP TABLE IF EXISTS "blobs_nb_origins";
CREATE TABLE "blobs_nb_origins" ("swhid" TEXT, "count" INTEGER);
.mode tabs
.import "| zstdcat blobs-nb-origins.csv.zst" "blobs_nb_origins"
.mode list
CREATE INDEX idx_blobs_nb_origins_swhid ON blobs_nb_origins (swhid);
DROP TABLE IF EXISTS "blobs_origins";
CREATE TABLE "blobs_origins" ("swhid" TEXT, "url" TEXT);
.mode tabs
.import "| zstdcat blobs-origins.csv.zst" "blobs_origins"
.mode list
CREATE INDEX idx_blobs_origins_swhid ON blobs_origins (swhid);
DROP TABLE IF EXISTS "blobs_scancode";
CREATE TABLE "blobs_scancode" ("sha1" TEXT, "license" TEXT, score REAL);
.import --csv --skip 1 "| zstdcat blobs-scancode.csv.zst" "blobs_scancode"
CREATE INDEX idx_blobs_scancode_sha1 ON blobs_scancode (sha1);