diff --git a/sql/pimp_db.sql b/sql/crawler.sql similarity index 67% copy from sql/pimp_db.sql copy to sql/crawler.sql index f038bf7..491f382 100644 --- a/sql/pimp_db.sql +++ b/sql/crawler.sql @@ -1,114 +1,78 @@ -CREATE VIEW orig_repos AS - SELECT id, name, full_name, html_url, description, last_seen - FROM repos - WHERE NOT fork; - -CREATE VIEW fork_repos AS - SELECT id, name, full_name, html_url, description, last_seen - FROM repos - WHERE fork - -CREATE EXTENSION pg_trgm; - -CREATE INDEX ix_trgm_repos_description ON - repos USING gin (description gin_trgm_ops); - -CREATE INDEX ix_trgm_repos_full_name ON - repos USING gin (full_name gin_trgm_ops); - -CREATE TABLE repos_history ( - ts timestamp DEFAULT current_timestamp, - repos integer NOT NULL, - fork_repos integer, - orig_repos integer -); - -CREATE VIEW repo_creations AS - SELECT today.ts :: date as date, - today.repos - yesterday.repos as repos, - today.fork_repos - yesterday.fork_repos as fork_repos, - today.orig_repos - yesterday.orig_repos as orig_repos - FROM repos_history today - JOIN repos_history yesterday ON - (yesterday.ts = (SELECT max(ts) - FROM repos_history - WHERE ts < today.ts)); - -- -- return a random sample of repos, containing %percent repositories -- create or replace function repos_random_sample_array(percent real) -- returns setof repos as $$ -- declare -- samples integer; -- repo repos%rowtype; -- ids integer[]; -- begin -- select floor(count(*) / 100 * percent) into samples from repos; -- ids := array(select id from repos order by id); -- for i in 1 .. samples loop -- select * into repo -- from repos -- where id = ids[round(random() * samples)]; -- return next repo; -- end loop; -- return; -- end -- $$ -- language plpgsql; -- return a random sample of repositories create or replace function repos_random_sample(percent real) returns setof repos as $$ declare sample_size integer; begin select floor(count(*) / 100 * percent) into sample_size from repos; return query select * from repos order by random() limit sample_size; return; end $$ language plpgsql; -- -- return a random sample of repositories -- create or replace function random_sample_sequence(percent real) -- returns setof repos as $$ -- declare -- sample_size integer; -- seq_size integer; -- min_id integer; -- max_id integer; -- begin -- select floor(count(*) / 100 * percent) into sample_size from repos; -- select min(id) into min_id from repos; -- select max(id) into max_id from repos; -- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence -- -- to have enough of them -- return query -- select * from repos -- where id in -- (select floor(random() * (max_id - min_id + 1))::integer -- + min_id -- from generate_series(1, seq_size)) -- order by random() limit sample_size; -- return; -- end -- $$ -- language plpgsql; create or replace function repos_well_known() returns setof repos as $$ begin return query select * from repos where full_name like 'apache/%' or full_name like 'eclipse/%' or full_name like 'mozilla/%' or full_name = 'torvalds/linux' or full_name = 'gcc-mirror/gcc'; return; end $$ language plpgsql; diff --git a/sql/pimp_db.sql b/sql/pimp_db.sql index f038bf7..9e59e1c 100644 --- a/sql/pimp_db.sql +++ b/sql/pimp_db.sql @@ -1,114 +1,36 @@ CREATE VIEW orig_repos AS SELECT id, name, full_name, html_url, description, last_seen FROM repos WHERE NOT fork; CREATE VIEW fork_repos AS SELECT id, name, full_name, html_url, description, last_seen FROM repos WHERE fork CREATE EXTENSION pg_trgm; CREATE INDEX ix_trgm_repos_description ON repos USING gin (description gin_trgm_ops); CREATE INDEX ix_trgm_repos_full_name ON repos USING gin (full_name gin_trgm_ops); CREATE TABLE repos_history ( ts timestamp DEFAULT current_timestamp, repos integer NOT NULL, fork_repos integer, orig_repos integer ); CREATE VIEW repo_creations AS SELECT today.ts :: date as date, today.repos - yesterday.repos as repos, today.fork_repos - yesterday.fork_repos as fork_repos, today.orig_repos - yesterday.orig_repos as orig_repos FROM repos_history today JOIN repos_history yesterday ON (yesterday.ts = (SELECT max(ts) FROM repos_history WHERE ts < today.ts)); - --- -- return a random sample of repos, containing %percent repositories --- create or replace function repos_random_sample_array(percent real) --- returns setof repos as $$ --- declare --- samples integer; --- repo repos%rowtype; --- ids integer[]; --- begin --- select floor(count(*) / 100 * percent) into samples from repos; --- ids := array(select id from repos order by id); --- for i in 1 .. samples loop --- select * into repo --- from repos --- where id = ids[round(random() * samples)]; --- return next repo; --- end loop; --- return; --- end --- $$ --- language plpgsql; - --- return a random sample of repositories -create or replace function repos_random_sample(percent real) -returns setof repos as $$ -declare - sample_size integer; -begin - select floor(count(*) / 100 * percent) into sample_size from repos; - return query - select * from repos - order by random() - limit sample_size; - return; -end -$$ -language plpgsql; - --- -- return a random sample of repositories --- create or replace function random_sample_sequence(percent real) --- returns setof repos as $$ --- declare --- sample_size integer; --- seq_size integer; --- min_id integer; --- max_id integer; --- begin --- select floor(count(*) / 100 * percent) into sample_size from repos; --- select min(id) into min_id from repos; --- select max(id) into max_id from repos; --- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence --- -- to have enough of them --- return query --- select * from repos --- where id in --- (select floor(random() * (max_id - min_id + 1))::integer --- + min_id --- from generate_series(1, seq_size)) --- order by random() limit sample_size; --- return; --- end --- $$ --- language plpgsql; - -create or replace function repos_well_known() -returns setof repos as $$ -begin - return query - select * from repos - where full_name like 'apache/%' - or full_name like 'eclipse/%' - or full_name like 'mozilla/%' - or full_name = 'torvalds/linux' - or full_name = 'gcc-mirror/gcc'; - return; -end -$$ -language plpgsql;