Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9313184
crawler.sql
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Subscribers
None
crawler.sql
View Options
-- -- return a random sample of repos, containing %percent repositories
-- create or replace function repos_random_sample_array(percent real)
-- returns setof repos as $$
-- declare
-- samples integer;
-- repo repos%rowtype;
-- ids integer[];
-- begin
-- select floor(count(*) / 100 * percent) into samples from repos;
-- ids := array(select id from repos order by id);
-- for i in 1 .. samples loop
-- select * into repo
-- from repos
-- where id = ids[round(random() * samples)];
-- return next repo;
-- end loop;
-- return;
-- end
-- $$
-- language plpgsql;
-- return a random sample of repositories
create
or
replace
function
repos_random_sample
(
percent
real
)
returns
setof
repos
as
$$
declare
sample_size
integer
;
begin
select
floor
(
count
(
*
)
/
100
*
percent
)
into
sample_size
from
repos
;
return
query
select
*
from
repos
order
by
random
()
limit
sample_size
;
return
;
end
$$
language
plpgsql
;
-- -- return a random sample of repositories
-- create or replace function random_sample_sequence(percent real)
-- returns setof repos as $$
-- declare
-- sample_size integer;
-- seq_size integer;
-- min_id integer;
-- max_id integer;
-- begin
-- select floor(count(*) / 100 * percent) into sample_size from repos;
-- select min(id) into min_id from repos;
-- select max(id) into max_id from repos;
-- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence
-- -- to have enough of them
-- return query
-- select * from repos
-- where id in
-- (select floor(random() * (max_id - min_id + 1))::integer
-- + min_id
-- from generate_series(1, seq_size))
-- order by random() limit sample_size;
-- return;
-- end
-- $$
-- language plpgsql;
create
or
replace
function
repos_well_known
()
returns
setof
repos
as
$$
begin
return
query
select
*
from
repos
where
full_name
like
'apache/%'
or
full_name
like
'eclipse/%'
or
full_name
like
'mozilla/%'
or
full_name
=
'torvalds/linux'
or
full_name
=
'gcc-mirror/gcc'
;
return
;
end
$$
language
plpgsql
;
create
table
crawl_history
(
id
bigserial
primary
key
,
repo
integer
references
repos
(
id
),
task_id
uuid
,
-- celery task id
date
timestamptz
not
null
,
duration
interval
,
status
boolean
,
result
json
,
stdout
text
,
stderr
text
);
create
index
on
crawl_history
(
repo
);
create
view
missing_orig_repos
AS
select
*
from
orig_repos
as
repos
where
not
exists
(
select
1
from
crawl_history
as
history
where
history
.
repo
=
repos
.
id
);
create
view
missing_fork_repos
AS
select
*
from
fork_repos
as
repos
where
not
exists
(
select
1
from
crawl_history
as
history
where
history
.
repo
=
repos
.
id
);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 11:27 AM (1 w, 10 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3238955
Attached To
rDLS Listers
Event Timeline
Log In to Comment