From f1adec6fde73803f1427ea602cebb5eef101e78c Mon Sep 17 00:00:00 2001 From: Stefano Zacchiroli Date: Thu, 2 Jul 2015 18:04:30 +0200 Subject: [PATCH] pimp_db: add plpgsql function to extract random repos sample --- sql/pimp_db.sql | 75 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/sql/pimp_db.sql b/sql/pimp_db.sql index a76bbec..2eeb8ad 100644 --- a/sql/pimp_db.sql +++ b/sql/pimp_db.sql @@ -26,11 +26,74 @@ CREATE TABLE repos_history ( CREATE VIEW repo_creations AS SELECT today.ts :: date as date, - today.repos - yesterday.repos as repos, - today.fork_repos - yesterday.fork_repos as fork_repos, - today.orig_repos - yesterday.orig_repos as orig_repos + today.repos - yesterday.repos as repos, + today.fork_repos - yesterday.fork_repos as fork_repos, + today.orig_repos - yesterday.orig_repos as orig_repos FROM repos_history today JOIN repos_history yesterday ON - (yesterday.ts = (SELECT max(ts) - FROM repos_history - WHERE ts < today.ts)); + (yesterday.ts = (SELECT max(ts) + FROM repos_history + WHERE ts < today.ts)); + +-- -- return a random sample of repos, containing %percent repositories +-- create or replace function repos_random_sample_array(percent real) +-- returns setof repos as $$ +-- declare +-- samples integer; +-- repo repos%rowtype; +-- ids integer[]; +-- begin +-- select floor(count(*) / 100 * percent) into samples from repos; +-- ids := array(select id from repos order by id); +-- for i in 1 .. samples loop +-- select * into repo +-- from repos +-- where id = ids[round(random() * samples)]; +-- return next repo; +-- end loop; +-- return; +-- end +-- $$ +-- language plpgsql; + +-- return a random sample of repositories +create or replace function repos_random_sample(percent real) +returns setof repos as $$ +declare + sample_size integer; +begin + select floor(count(*) / 100 * percent) into sample_size from repos; + return query + select * from repos + order by random() + limit sample_size; + return; +end +$$ +language plpgsql; + +-- -- return a random sample of repositories +-- create or replace function random_sample_sequence(percent real) +-- returns setof repos as $$ +-- declare +-- sample_size integer; +-- seq_size integer; +-- min_id integer; +-- max_id integer; +-- begin +-- select floor(count(*) / 100 * percent) into sample_size from repos; +-- select min(id) into min_id from repos; +-- select max(id) into max_id from repos; +-- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence +-- -- to have enough of them +-- return query +-- select * from repos +-- where id in +-- (select floor(random() * (max_id - min_id + 1))::integer +-- + min_id +-- from generate_series(1, seq_size)) +-- order by random() limit sample_size; +-- return; +-- end +-- $$ +-- language plpgsql;