SQL: move stuff not strictly related to ghlister to crawler.sql

This commit is contained in:
Stefano Zacchiroli 2015-07-07 11:07:19 +02:00
parent df959dd584
commit 752e23eb4b
2 changed files with 78 additions and 78 deletions

78
sql/crawler.sql Normal file
View file

@ -0,0 +1,78 @@
-- -- return a random sample of repos, containing %percent repositories
-- create or replace function repos_random_sample_array(percent real)
-- returns setof repos as $$
-- declare
-- samples integer;
-- repo repos%rowtype;
-- ids integer[];
-- begin
-- select floor(count(*) / 100 * percent) into samples from repos;
-- ids := array(select id from repos order by id);
-- for i in 1 .. samples loop
-- select * into repo
-- from repos
-- where id = ids[round(random() * samples)];
-- return next repo;
-- end loop;
-- return;
-- end
-- $$
-- language plpgsql;
-- Return a random sample of rows from repos.
--
-- percent: share of the table to return, expressed as a percentage
--          (e.g. 10 for ten percent of the rows).
--
-- The sample size is floor(row_count * percent / 100). The row count is
-- multiplied by percent *before* dividing, and the divisor is written as
-- 100.0, so that no integer division takes place: count(*) / 100 would be
-- truncated to a whole number first, yielding an empty sample for any table
-- with fewer than 100 rows and an undersized one otherwise.
--
-- Rows are picked by ordering the table by random() and keeping the first
-- sample_size rows, so every row of repos is considered on each call.
create or replace function repos_random_sample(percent real)
returns setof repos as $$
declare
    sample_size integer;
begin
    select floor(count(*) * percent / 100.0) into sample_size from repos;
    return query
        select * from repos
        order by random()
        limit sample_size;
    return;
end
$$
language plpgsql;
-- -- return a random sample of repositories
-- create or replace function random_sample_sequence(percent real)
-- returns setof repos as $$
-- declare
-- sample_size integer;
-- seq_size integer;
-- min_id integer;
-- max_id integer;
-- begin
-- select floor(count(*) / 100 * percent) into sample_size from repos;
-- select min(id) into min_id from repos;
-- select max(id) into max_id from repos;
-- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence
-- -- to have enough of them
-- return query
-- select * from repos
-- where id in
-- (select floor(random() * (max_id - min_id + 1))::integer
-- + min_id
-- from generate_series(1, seq_size))
-- order by random() limit sample_size;
-- return;
-- end
-- $$
-- language plpgsql;
-- Return the repos rows that belong to a hard-coded list of well-known
-- projects: every repository whose full_name starts with apache/, eclipse/
-- or mozilla/, plus the two individual repositories torvalds/linux and
-- gcc-mirror/gcc.
create or replace function repos_well_known()
returns setof repos as $$
begin
    return query
        select r.*
        from repos as r
        where r.full_name like any (array['apache/%', 'eclipse/%', 'mozilla/%'])
           or r.full_name in ('torvalds/linux', 'gcc-mirror/gcc');
    return;
end
$$
language plpgsql;

View file

@ -34,81 +34,3 @@ CREATE VIEW repo_creations AS
(yesterday.ts = (SELECT max(ts)
FROM repos_history
WHERE ts < today.ts));
-- -- return a random sample of repos, containing %percent repositories
-- create or replace function repos_random_sample_array(percent real)
-- returns setof repos as $$
-- declare
-- samples integer;
-- repo repos%rowtype;
-- ids integer[];
-- begin
-- select floor(count(*) / 100 * percent) into samples from repos;
-- ids := array(select id from repos order by id);
-- for i in 1 .. samples loop
-- select * into repo
-- from repos
-- where id = ids[round(random() * samples)];
-- return next repo;
-- end loop;
-- return;
-- end
-- $$
-- language plpgsql;
-- Return a random sample of rows from repos.
--
-- percent: share of the table to return, expressed as a percentage
--          (e.g. 10 for ten percent of the rows).
--
-- The sample size is floor(row_count * percent / 100). The row count is
-- multiplied by percent *before* dividing, and the divisor is written as
-- 100.0, so that no integer division takes place: count(*) / 100 would be
-- truncated to a whole number first, yielding an empty sample for any table
-- with fewer than 100 rows and an undersized one otherwise.
--
-- Rows are picked by ordering the table by random() and keeping the first
-- sample_size rows, so every row of repos is considered on each call.
create or replace function repos_random_sample(percent real)
returns setof repos as $$
declare
    sample_size integer;
begin
    select floor(count(*) * percent / 100.0) into sample_size from repos;
    return query
        select * from repos
        order by random()
        limit sample_size;
    return;
end
$$
language plpgsql;
-- -- return a random sample of repositories
-- create or replace function random_sample_sequence(percent real)
-- returns setof repos as $$
-- declare
-- sample_size integer;
-- seq_size integer;
-- min_id integer;
-- max_id integer;
-- begin
-- select floor(count(*) / 100 * percent) into sample_size from repos;
-- select min(id) into min_id from repos;
-- select max(id) into max_id from repos;
-- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence
-- -- to have enough of them
-- return query
-- select * from repos
-- where id in
-- (select floor(random() * (max_id - min_id + 1))::integer
-- + min_id
-- from generate_series(1, seq_size))
-- order by random() limit sample_size;
-- return;
-- end
-- $$
-- language plpgsql;
-- Return the repos rows that belong to a hard-coded list of well-known
-- projects: every repository whose full_name starts with apache/, eclipse/
-- or mozilla/, plus the two individual repositories torvalds/linux and
-- gcc-mirror/gcc.
create or replace function repos_well_known()
returns setof repos as $$
begin
    return query
        select r.*
        from repos as r
        where r.full_name like any (array['apache/%', 'eclipse/%', 'mozilla/%'])
           or r.full_name in ('torvalds/linux', 'gcc-mirror/gcc');
    return;
end
$$
language plpgsql;