From 752e23eb4b8b262aa21b315ff340b21cdb1f32eb Mon Sep 17 00:00:00 2001
From: Stefano Zacchiroli
Date: Tue, 7 Jul 2015 11:07:19 +0200
Subject: [PATCH] SQL: move stuff not strictly related to ghlister to crawler.sql

While moving, repos_random_sample now computes its sample size as
count(*) * percent / 100. The previous count(*) / 100 * percent performed
an integer division first, which yields an empty sample for tables with
fewer than 100 rows. The copy removed from pimp_db.sql is left untouched.

---
Reviewer notes (this section is ignored by git-am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Check the context and removed lines of the pimp_db.sql
  hunk against the pre-image before applying; `git apply --ignore-whitespace`
  helps with indentation differences only.
- The post-image of sql/crawler.sql differs from the one originally hashed,
  so the blob id on its "index" line is stale.

 sql/crawler.sql | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
 sql/pimp_db.sql | 78 ---------------------------------------------
 2 files changed, 85 insertions(+), 78 deletions(-)
 create mode 100644 sql/crawler.sql

diff --git a/sql/crawler.sql b/sql/crawler.sql
new file mode 100644
index 0000000..491f382
--- /dev/null
+++ b/sql/crawler.sql
@@ -0,0 +1,85 @@
+
+-- -- return a random sample of repos, containing %percent repositories
+-- create or replace function repos_random_sample_array(percent real)
+--   returns setof repos as $$
+-- declare
+--     samples integer;
+--     repo repos%rowtype;
+--     ids integer[];
+-- begin
+--     select floor(count(*) / 100 * percent) into samples from repos;
+--     ids := array(select id from repos order by id);
+--     for i in 1 .. samples loop
+--         select * into repo
+--             from repos
+--             where id = ids[round(random() * samples)];
+--         return next repo;
+--     end loop;
+--     return;
+-- end
+-- $$
+-- language plpgsql;
+
+-- Return a random sample of the rows of repos.
+--
+-- percent: sample size as a percentage of the number of rows in repos;
+-- the sample holds floor(count(*) * percent / 100) rows.
+create or replace function repos_random_sample(percent real)
+returns setof repos as $$
+declare
+    sample_size integer;
+begin
+    -- multiply before dividing: count(*) is an integer type, so
+    -- count(*) / 100 would truncate before percent is applied
+    select floor(count(*) * percent / 100) into sample_size from repos;
+    return query
+        select * from repos
+        order by random()
+        limit sample_size;
+    return;
+end
+$$
+language plpgsql;
+
+-- -- return a random sample of repositories
+-- create or replace function random_sample_sequence(percent real)
+-- returns setof repos as $$
+-- declare
+--     sample_size integer;
+--     seq_size integer;
+--     min_id integer;
+--     max_id integer;
+-- begin
+--     select floor(count(*) / 100 * percent) into sample_size from repos;
+--     select min(id) into min_id from repos;
+--     select max(id) into max_id from repos;
+--     seq_size := sample_size * 3;  -- IDs are sparse, generate a larger sequence
+--                                   -- to have enough of them
+--     return query
+--         select * from repos
+--         where id in
+--             (select floor(random() * (max_id - min_id + 1))::integer
+--                     + min_id
+--              from generate_series(1, seq_size))
+--         order by random() limit sample_size;
+--     return;
+-- end
+-- $$
+-- language plpgsql;
+
+-- Return the rows of repos whose full_name starts with apache/, eclipse/
+-- or mozilla/, or is exactly torvalds/linux or gcc-mirror/gcc.
+create or replace function repos_well_known()
+returns setof repos as $$
+begin
+    return query
+        select * from repos
+        where full_name like 'apache/%'
+            or full_name like 'eclipse/%'
+            or full_name like 'mozilla/%'
+            or full_name = 'torvalds/linux'
+            or full_name = 'gcc-mirror/gcc';
+    return;
+end
+$$
+language plpgsql;
diff --git a/sql/pimp_db.sql b/sql/pimp_db.sql
index f038bf7..9e59e1c 100644
--- a/sql/pimp_db.sql
+++ b/sql/pimp_db.sql
@@ -34,81 +34,3 @@ CREATE VIEW repo_creations AS
 	 (yesterday.ts =
 	  (SELECT max(ts) FROM repos_history
 	   WHERE ts < today.ts));
-
--- -- return a random sample of repos, containing %percent repositories
--- create or replace function repos_random_sample_array(percent real)
---   returns setof repos as $$
--- declare
---     samples integer;
---     repo repos%rowtype;
---     ids integer[];
--- begin
---     select floor(count(*) / 100 * percent) into samples from repos;
---     ids := array(select id from repos order by id);
---     for i in 1 .. samples loop
---         select * into repo
---             from repos
---             where id = ids[round(random() * samples)];
---         return next repo;
---     end loop;
---     return;
--- end
--- $$
--- language plpgsql;
-
--- return a random sample of repositories
-create or replace function repos_random_sample(percent real)
-returns setof repos as $$
-declare
-    sample_size integer;
-begin
-    select floor(count(*) / 100 * percent) into sample_size from repos;
-    return query
-        select * from repos
-        order by random()
-        limit sample_size;
-    return;
-end
-$$
-language plpgsql;
-
--- -- return a random sample of repositories
--- create or replace function random_sample_sequence(percent real)
--- returns setof repos as $$
--- declare
---     sample_size integer;
---     seq_size integer;
---     min_id integer;
---     max_id integer;
--- begin
---     select floor(count(*) / 100 * percent) into sample_size from repos;
---     select min(id) into min_id from repos;
---     select max(id) into max_id from repos;
---     seq_size := sample_size * 3;  -- IDs are sparse, generate a larger sequence
---                                   -- to have enough of them
---     return query
---         select * from repos
---         where id in
---             (select floor(random() * (max_id - min_id + 1))::integer
---                     + min_id
---              from generate_series(1, seq_size))
---         order by random() limit sample_size;
---     return;
--- end
--- $$
--- language plpgsql;
-
-create or replace function repos_well_known()
-returns setof repos as $$
-begin
-    return query
-        select * from repos
-        where full_name like 'apache/%'
-            or full_name like 'eclipse/%'
-            or full_name like 'mozilla/%'
-            or full_name = 'torvalds/linux'
-            or full_name = 'gcc-mirror/gcc';
-    return;
-end
-$$
-language plpgsql;