diff --git a/sql/crawler.sql b/sql/crawler.sql
deleted file mode 100644
index 0a30b54aaac383562caee9e8304ee824150543c2..0000000000000000000000000000000000000000
--- a/sql/crawler.sql
+++ /dev/null
@@ -1,136 +0,0 @@
-
--- Crawler support: random-sample helpers over repos, the crawl_history log
--- table, and views listing repositories with no crawl_history row.
--- The two views at the end read orig_repos and fork_repos, which are
--- created in sql/pimp_db.sql.
-
--- Disabled alternative implementation (array lookup by random index).
--- -- return a random sample of repos, containing %percent repositories
--- create or replace function repos_random_sample_array(percent real)
--- returns setof repos as $$
--- declare
---   samples integer;
---   repo repos%rowtype;
---   ids integer[];
--- begin
---   select floor(count(*) / 100 * percent) into samples from repos;
---   ids := array(select id from repos order by id);
---   for i in 1 .. samples loop
---     select * into repo
---       from repos
---      where id = ids[round(random() * samples)];
---     return next repo;
---   end loop;
---   return;
--- end
--- $$
--- language plpgsql;
-
--- Return a random sample of repositories.
---
--- percent: share of the repos table to return, as a percentage. Values are
---          clamped to the range 0..100; NULL is treated as 0 (greatest()
---          ignores NULL arguments), so it yields no rows.
--- Returns: floor(count(*) * percent / 100) rows of repos, in random order.
---
--- The multiplication happens before the division and in numeric arithmetic:
--- count(*) / 100 on its own is bigint division, which truncates and would
--- make the sample empty for any table with fewer than 100 rows.
-create or replace function repos_random_sample(percent real)
-returns setof repos as $$
-declare
-    sample_size integer;
-begin
-    select floor(count(*) * least(greatest(percent, 0), 100)::numeric / 100)
-      into sample_size
-      from repos;
-
-    return query
-        select *
-          from repos
-         order by random()
-         limit sample_size;
-    return;
-end
-$$
-language plpgsql;
-
--- Disabled alternative implementation (random ids drawn from the id range).
--- -- return a random sample of repositories
--- create or replace function random_sample_sequence(percent real)
--- returns setof repos as $$
--- declare
---   sample_size integer;
---   seq_size integer;
---   min_id integer;
---   max_id integer;
--- begin
---   select floor(count(*) / 100 * percent) into sample_size from repos;
---   select min(id) into min_id from repos;
---   select max(id) into max_id from repos;
---   seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence
---                                -- to have enough of them
---   return query
---     select * from repos
---      where id in
---        (select floor(random() * (max_id - min_id + 1))::integer
---                + min_id
---           from generate_series(1, seq_size))
---      order by random() limit sample_size;
---   return;
--- end
--- $$
--- language plpgsql;
-
--- Return the repos rows whose full_name is under the apache/, eclipse/ or
--- mozilla/ prefix, or is exactly torvalds/linux or gcc-mirror/gcc.
-create or replace function repos_well_known()
-returns setof repos as $$
-begin
-    return query
-        select *
-          from repos
-         where full_name like 'apache/%'
-            or full_name like 'eclipse/%'
-            or full_name like 'mozilla/%'
-            or full_name = 'torvalds/linux'
-            or full_name = 'gcc-mirror/gcc';
-    return;
-end
-$$
-language plpgsql;
-
--- Crawl log. NOTE(review): column names suggest one row per crawl of one
--- repository; confirm against the code that inserts here.
-create table crawl_history (
-    id bigserial primary key,
-    repo integer references repos(id),
-    task_id uuid,  -- celery task id
-    date timestamptz not null,
-    duration interval,
-    status boolean,
-    result json,
-    stdout text,
-    stderr text
-);
-
--- Supports the per-repository "not exists" lookups in the views below.
-create index on crawl_history (repo);
-
--- Rows of orig_repos that have no crawl_history row.
-create view missing_orig_repos as
-    select *
-      from orig_repos as repos
-     where not exists
-           (select 1
-              from crawl_history as history
-             where history.repo = repos.id);
-
--- Rows of fork_repos that have no crawl_history row.
-create view missing_fork_repos as
-    select *
-      from fork_repos as repos
-     where not exists
-           (select 1
-              from crawl_history as history
-             where history.repo = repos.id);
diff --git a/sql/pimp_db.sql b/sql/pimp_db.sql
deleted file mode 100644
index 2cc9cef7ceee4026fc083cfb8a51246189a6f2e5..0000000000000000000000000000000000000000
--- a/sql/pimp_db.sql
+++ /dev/null
@@ -1,46 +0,0 @@
-
--- Views, trigram indexes and the repos_history table built on top of repos.
-
--- Rows of repos where fork is false.
-create view orig_repos as
-    select id, name, full_name, html_url, description, last_seen
-      from repos
-     where not fork;
-
--- Rows of repos where fork is true.
-create view fork_repos as
-    select id, name, full_name, html_url, description, last_seen
-      from repos
-     where fork;
-
--- Provides the gin_trgm_ops operator class used by the two indexes below.
-create extension if not exists pg_trgm;
-
-create index ix_trgm_repos_description on
-    repos using gin (description gin_trgm_ops);
-
-create index ix_trgm_repos_full_name on
-    repos using gin (full_name gin_trgm_ops);
-
--- Timestamped counters. NOTE(review): presumably periodic snapshots of the
--- total, fork and non-fork repository counts; confirm against the writer.
-create table repos_history (
-    ts timestamp default current_timestamp,
-    repos integer not null,
-    fork_repos integer,
-    orig_repos integer
-);
-
--- For each repos_history row, the change in every counter since the row with
--- the latest earlier ts. The earliest row has no predecessor, so the inner
--- join leaves it out.
-create view repo_creations as
-    select today.ts :: date as date,
-           today.repos - yesterday.repos as repos,
-           today.fork_repos - yesterday.fork_repos as fork_repos,
-           today.orig_repos - yesterday.orig_repos as orig_repos
-      from repos_history today
-      join repos_history yesterday on
-           (yesterday.ts = (select max(ts)
-                              from repos_history
-                             where ts < today.ts));