Revert to the pre-qless refactoring version

commit 2a62db6827
parent 9550ca3edd
Author: Nicolas Dandrimont
Date:   2016-09-13 14:57:26 +02:00

13 changed files with 582 additions and 49 deletions

README (24 lines changed)

@ -18,20 +18,30 @@ along with this program.
Dependencies
============
See requirements.txt
- python3
- python3-psycopg2
- python3-requests
- python3-sqlalchemy
Deployment
==========
The github lister can be run standalone by using `python3 -m swh.lister.github.lister`.
1. git clone under $GHLISTER_ROOT (of your choosing)
2. mkdir ~/.config/swh/ ~/.cache/swh/lister-github/
3. edit $GHLISTER_ROOT/etc/crontab and customize GHLISTER_ROOT
4. crontab $GHLISTER_ROOT/etc/crontab
5. create configuration file ~/.config/swh/lister-github.ini
Sample configuration file
-------------------------
cat ~/.config/swh/lister/github.ini
cat ~/.config/swh/lister-github.ini
[main]
storage_class = local_storage
storage_args = dbname=softwareheritage-dev, /srv/softwareheritage/objects
queue_file = ~/.cache/swh/lister-github/queue.pickle
credentials = olasd:olasd_github_token, zacchiro:zacchiro_github_token
db_url = postgres:///github
# see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
cache_dir = /home/zack/.cache/swh/lister-github
log_dir = /home/zack/.cache/swh/lister-github
username = foobar # github username
password = quux # github password
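
For reference, a minimal sketch of how this file is consumed (it mirrors
read_conf in bin/ghlister; error handling is omitted):

    import configparser
    import os

    config = configparser.ConfigParser()
    config.read(os.path.expanduser('~/.config/swh/lister-github.ini'))
    db_url = config['main']['db_url']  # e.g. postgres:///github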

TODO (8 lines changed)

@ -1,5 +1,13 @@
# -*- mode: org -*-
* TODO SQL: rework repo_history/repo_creations to use last_seen
* TODO cache dir: split json data from other HTTP info
for easier further processing of additional API data
* TODO cache dir: split into subdirs
to avoid hammering the filesystem with one huge directory (200k+ files);
a possible sharding scheme is sketched after this list
* TODO network-level traceback
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/response.py", line 186, in read

bin/batch Executable file (41 lines)

@ -0,0 +1,41 @@
#!/bin/bash
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
export https_proxy="127.0.0.1:8118" # use Tor
export PYTHONPATH=`pwd`
DBNAME=github
DBCONN="-p 5433"
psql="psql $DBCONN --no-psqlrc --pset t --pset format=unaligned ${DBNAME}"
BATCH_NO="$1"
shift
if [ -z "$BATCH_NO" ] ; then
echo "Usage: batch MILLION_NO [ MIN_ID | continue ]"
exit 2
fi
MIN_ID="$1"
shift
min_id=$[ ($BATCH_NO - 1) * 1000000 + 1 ]
max_id=$[ $BATCH_NO * 1000000 ]
# allow min_id override on the command line
if [ "$MIN_ID" = "continue" ] ; then
last_id=$(echo "select max(id) from repos where ${min_id} <= id and id <= ${max_id}" | $psql)
if [ "$last_id" -eq "$last_id" ] 2> /dev/null ; then # is an integer?
echo "Continuing from last known id ${last_id}"
min_id=$last_id
fi
elif [ -n "$MIN_ID" ] ; then
min_id=$[ $MIN_ID > $min_id ? $MIN_ID : $min_id ]
fi
cmd="bin/ghlister list ${min_id}-${max_id}"
echo Running $cmd ...
$cmd
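
For example, "bin/batch 3" lists repository ids 2000001 through 3000000, and
"bin/batch 3 continue" resumes that window from the highest repository id
already recorded in the repos table.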

bin/ghlister Executable file (135 lines)

@ -0,0 +1,135 @@
#!/usr/bin/python3
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import argparse
import configparser
import logging
import os
import sys
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from swh.lister.github import lister, models
from swh.lister.github.db_utils import session_scope
DEFAULT_CONF = {
'cache_dir': './cache',
'log_dir': './log',
'cache_json': 'False',
}
def db_connect(db_url):
engine = create_engine(db_url)
session = sessionmaker(bind=engine)
return (engine, session)
def int_interval(s):
"""parse an "N-M" string as an interval.
Return an (N,M) int (or None) pair
"""
def not_an_interval():
raise argparse.ArgumentTypeError('not an interval: ' + s)
def parse_int(s):
if s:
return int(s)
else:
return None
if '-' not in s:
not_an_interval()
parts = s.split('-')
if len(parts) > 2:
not_an_interval()
return tuple([parse_int(p) for p in parts])
def parse_args():
cli = argparse.ArgumentParser(
description='list GitHub repositories and load them into a DB')
cli.add_argument('--db-url', '-d', metavar='SQLALCHEMY_URL',
help='SQLAlchemy DB URL (override conffile); see '
'<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>') # NOQA
cli.add_argument('--verbose', '-v', action='store_true',
help='be verbose')
subcli = cli.add_subparsers(dest='action')
subcli.add_parser('createdb', help='initialize DB')
subcli.add_parser('dropdb', help='destroy DB')
list_cli = subcli.add_parser('list', help='list repositories')
list_cli.add_argument('interval',
type=int_interval,
help='interval of repository IDs to list, '
'in N-M format; either N or M can be omitted.')
list_cli = subcli.add_parser('catchup',
help='catchup with new repos since last time')
args = cli.parse_args()
if not args.action:
cli.error('no action given')
return args
def read_conf(args):
config = configparser.ConfigParser(defaults=DEFAULT_CONF)
config.read(os.path.expanduser('~/.config/swh/lister-github.ini'))
conf = config._sections['main']
# overrides
if args.db_url:
conf['db_url'] = args.db_url
# typing
if 'cache_json' in conf and conf['cache_json'].lower() == 'true':
conf['cache_json'] = True
else:
conf['cache_json'] = False
return conf
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) # XXX
args = parse_args()
conf = read_conf(args)
db_engine, mk_session = db_connect(conf['db_url'])
if args.action == 'createdb':
models.SQLBase.metadata.create_all(db_engine)
elif args.action == 'dropdb':
models.SQLBase.metadata.drop_all(db_engine)
elif args.action == 'list':
lister.fetch(conf,
mk_session,
min_id=args.interval[0],
max_id=args.interval[1])
elif args.action == 'catchup':
with session_scope(mk_session) as db_session:
last_known_id = lister.last_repo_id(db_session)
if last_known_id is not None:
logging.info('catching up from last known repo id: %d' %
last_known_id)
lister.fetch(conf,
mk_session,
min_id=last_known_id + 1,
max_id=None)
else:
logging.error('Cannot catchup: no last known id found. Abort.')
sys.exit(2)
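
A typical session, as suggested by the argument parser above:

    bin/ghlister createdb
    bin/ghlister list 1-1000000
    bin/ghlister catchup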

bin/reset.sh Normal file (9 lines)

@ -0,0 +1,9 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
export PYTHONPATH=`pwd`
dropdb github
createdb github
bin/ghlister createdb
rm cache/*

bin/status Executable file (18 lines)

@ -0,0 +1,18 @@
#!/bin/bash
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# intended usage: watch -n 60 bin/status
DBNAME="github"
DBCONN="-p 5433"
psql="psql $DBCONN --no-psqlrc"
ps auxw | grep bin/batch
echo "SELECT COUNT(*), MAX(id) FROM repos" | $psql "$DBNAME"
echo "\\l+ ${DBNAME}" | $psql "$DBNAME"
du -sh cache/
# peek at the rate-limit header in the 4th most recent cache file
zgrep -i --color=auto "'X-RateLimit-Remaining'" cache/$(ls -t cache/ | head -n 4 | tail -n 1)

etc/crontab Normal file (5 lines)

@ -0,0 +1,5 @@
SHELL=/bin/bash
GHLISTER_ROOT=/home/zack/src/swh-lister-github
# m h dom mon dow command
0 8 * * * PYTHONPATH=$GHLISTER_ROOT $GHLISTER_ROOT/bin/ghlister catchup >> ~/.cache/swh/lister-github/$(date +\%Y\%m\%d).log 2>&1

requirements.txt

@ -1,7 +1,2 @@
python-dateutil
qless-py
requests
redis
swh.core
swh.storage
SQLAlchemy

sql/crawler.sql Normal file (106 lines)

@ -0,0 +1,106 @@
-- -- return a random sample of repos, containing %percent repositories
-- create or replace function repos_random_sample_array(percent real)
-- returns setof repos as $$
-- declare
-- samples integer;
-- repo repos%rowtype;
-- ids integer[];
-- begin
-- select floor(count(*) / 100 * percent) into samples from repos;
-- ids := array(select id from repos order by id);
-- for i in 1 .. samples loop
-- select * into repo
-- from repos
-- where id = ids[round(random() * samples)];
-- return next repo;
-- end loop;
-- return;
-- end
-- $$
-- language plpgsql;
-- return a random sample of repositories
create or replace function repos_random_sample(percent real)
returns setof repos as $$
declare
sample_size integer;
begin
select floor(count(*) / 100 * percent) into sample_size from repos;
return query
select * from repos
order by random()
limit sample_size;
return;
end
$$
language plpgsql;
-- -- return a random sample of repositories
-- create or replace function random_sample_sequence(percent real)
-- returns setof repos as $$
-- declare
-- sample_size integer;
-- seq_size integer;
-- min_id integer;
-- max_id integer;
-- begin
-- select floor(count(*) / 100 * percent) into sample_size from repos;
-- select min(id) into min_id from repos;
-- select max(id) into max_id from repos;
-- seq_size := sample_size * 3; -- IDs are sparse, generate a larger sequence
-- -- to have enough of them
-- return query
-- select * from repos
-- where id in
-- (select floor(random() * (max_id - min_id + 1))::integer
-- + min_id
-- from generate_series(1, seq_size))
-- order by random() limit sample_size;
-- return;
-- end
-- $$
-- language plpgsql;
create or replace function repos_well_known()
returns setof repos as $$
begin
return query
select * from repos
where full_name like 'apache/%'
or full_name like 'eclipse/%'
or full_name like 'mozilla/%'
or full_name = 'torvalds/linux'
or full_name = 'gcc-mirror/gcc';
return;
end
$$
language plpgsql;
create table crawl_history (
id bigserial primary key,
repo integer references repos(id),
task_id uuid, -- celery task id
date timestamptz not null,
duration interval,
status boolean,
result json,
stdout text,
stderr text
);
create index on crawl_history (repo);
create view missing_orig_repos AS
select *
from orig_repos as repos
where not exists
(select 1 from crawl_history as history
where history.repo = repos.id);
create view missing_fork_repos AS
select *
from fork_repos as repos
where not exists
(select 1 from crawl_history as history
where history.repo = repos.id);
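
The missing_orig_repos and missing_fork_repos views select repositories with
no crawl_history row at all, i.e. the never-crawled backlog, while
repos_random_sample(5) returns roughly a 5% sample of the repos table for
spot-checking.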

sql/pimp_db.sql Normal file (36 lines)

@ -0,0 +1,36 @@
create view orig_repos as
select id, name, full_name, html_url, description, last_seen
from repos
where not fork;
create view fork_repos as
select id, name, full_name, html_url, description, last_seen
from repos
where fork;
create extension pg_trgm;
create index ix_trgm_repos_description on
repos using gin (description gin_trgm_ops);
create index ix_trgm_repos_full_name on
repos using gin (full_name gin_trgm_ops);
create table repos_history (
ts timestamp default current_timestamp,
repos integer not null,
fork_repos integer,
orig_repos integer
);
create view repo_creations as
select today.ts :: date as date,
today.repos - yesterday.repos as repos,
today.fork_repos - yesterday.fork_repos as fork_repos,
today.orig_repos - yesterday.orig_repos as orig_repos
from repos_history today
join repos_history yesterday on
(yesterday.ts = (select max(ts)
from repos_history
where ts < today.ts));
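
repo_creations compares each repos_history snapshot with the immediately
preceding one, so the reported deltas are per-snapshot counts; they read as
daily figures only if a row is inserted into repos_history about once a day.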

swh/lister/github/db_utils.py Normal file (18 lines)

@ -0,0 +1,18 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from contextlib import contextmanager
@contextmanager
def session_scope(mk_session):
session = mk_session()
try:
yield session
session.commit()
except:
session.rollback()
raise
finally:
session.close()
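
Usage sketch (commit-on-success, rollback-on-exception semantics; mk_session
comes from db_connect in bin/ghlister, and the added object is a placeholder):

    from swh.lister.github.db_utils import session_scope

    with session_scope(mk_session) as db_session:
        db_session.add(some_object)  # committed on success, rolled back on error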

swh/lister/github/lister.py (53 to 161 lines)

@ -1,53 +1,161 @@
# Copyright © 2016 The Software Heritage Developers <swh-devel@inria.fr>
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# see https://developer.github.com/v3/ for GitHub API documentation
import gzip
import logging
import os
import re
import requests
import time
from swh.core.config import load_named_config
from swh.storage import get_storage
from pprint import pformat
from sqlalchemy import func
from . import req_queue, processors, cache
DEFAULT_CONFIG = {
'queue_url': ('str', 'redis://localhost'),
'cache_url': ('str', 'redis://localhost'),
'storage_class': ('str', 'local_storage'),
'storage_args': ('list[str]', ['dbname=softwareheritage-dev',
'/srv/softwareheritage/objects']),
'credentials': ('list[str]', []),
}
CONFIG_NAME = 'lister/github.ini'
from swh.lister.github.db_utils import session_scope
from swh.lister.github.models import Repository
def run_from_queue():
config = load_named_config(CONFIG_NAME, DEFAULT_CONFIG)
GH_API_URL = 'https://api.github.com'
MAX_RETRIES = 7
MAX_SLEEP = 3600 # 1 hour
CONN_SLEEP = 10
cache.init_cache(config['cache_url'])
REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')
queue_url = os.path.expanduser(config['queue_url'])
credentials = {}
for credential in config['credentials']:
login, token = credential.split(':')
credentials[login] = {'token': token}
def save_http_response(r, cache_dir):
def escape_url_path(p):
return p.replace('/', '__')
queue = req_queue.from_url(queue_url)
fname = os.path.join(cache_dir,
escape_url_path(r.request.path_url) + '.gz')
with gzip.open(fname, 'w') as f:
def emit(s):
f.write(bytes(s, 'UTF-8'))
emit(pformat(r.request.path_url))
emit('\n#\n')
emit(pformat(r.status_code))
emit('\n#\n')
emit(pformat(r.headers))
emit('\n#\n')
emit(pformat(r.json()))
if req_queue.empty(queue):
req_queue.push(queue, {'type': 'repositories', 'url': None})
session = requests.Session()
storage = get_storage(config['storage_class'], config['storage_args'])
def gh_api_request(path, username=None, password=None, headers=None):
    params = {}
    if headers is None:  # avoid a shared mutable default argument
        headers = {}
    if 'Accept' not in headers:  # request version 3 of the API
        headers['Accept'] = 'application/vnd.github.v3+json'
    params['headers'] = headers
    if username is not None and password is not None:
        params['auth'] = (username, password)
while not req_queue.empty(queue):
processors.process_one_item(
queue, session=session, credentials=credentials,
storage=storage
)
retries_left = MAX_RETRIES
while retries_left > 0:
logging.debug('sending API request: %s' % path)
try:
r = requests.get(GH_API_URL + path, **params)
except requests.exceptions.ConnectionError:
# network-level connection error, try again
logging.warn('connection error upon %s: sleep for %d seconds' %
(path, CONN_SLEEP))
time.sleep(CONN_SLEEP)
retries_left -= 1
continue
if __name__ == '__main__':
run_from_queue()
if r.ok: # all went well, do not retry
break
# detect throttling
if r.status_code == 403 and \
int(r.headers['X-RateLimit-Remaining']) == 0:
delay = int(r.headers['X-RateLimit-Reset']) - time.time()
delay = min(delay, MAX_SLEEP)
logging.warn('rate limited upon %s: sleep for %d seconds' %
(path, int(delay)))
time.sleep(delay)
else: # unexpected error, abort
break
retries_left -= 1
if not retries_left:
logging.warn('giving up on %s: max retries exceeded' % path)
return r
def lookup_repo(db_session, repo_id):
return db_session.query(Repository) \
.filter(Repository.id == repo_id) \
.first()
def last_repo_id(db_session):
t = db_session.query(func.max(Repository.id)) \
.first()
if t is not None:
return t[0]
# else: return None
INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description', 'fork']
def inject_repo(db_session, repo):
logging.debug('injecting repo %d' % repo['id'])
if lookup_repo(db_session, repo['id']):
logging.info('not injecting already present repo %d' % repo['id'])
return
kwargs = {k: repo[k] for k in INJECT_KEYS if k in repo}
sql_repo = Repository(**kwargs)
db_session.add(sql_repo)
class FetchError(RuntimeError):
def __init__(self, response):
self.response = response
def __str__(self):
return repr(self.response)
def fetch(conf, mk_session, min_id=None, max_id=None):
if min_id is None:
min_id = 1
if max_id is None:
max_id = float('inf')
next_id = min_id
cred = {}
for key in ['username', 'password']:
if key in conf:
cred[key] = conf[key]
while min_id <= next_id <= max_id:
logging.info('listing repos starting at %d' % next_id)
since = next_id - 1 # github API ?since=... is '>' strict, not '>='
repos_res = gh_api_request('/repositories?since=%d' % since, **cred)
if 'cache_dir' in conf and conf['cache_json']:
save_http_response(repos_res, conf['cache_dir'])
if not repos_res.ok:
raise FetchError(repos_res)
repos = repos_res.json()
for repo in repos:
if repo['id'] > max_id: # do not overstep max_id
break
with session_scope(mk_session) as db_session:
inject_repo(db_session, repo)
if 'next' in repos_res.links:
next_url = repos_res.links['next']['url']
m = REPO_API_URL_RE.match(next_url) # parse next_id
next_id = int(m.group(1)) + 1
else:
logging.info('stopping after id %d, no next link found' % next_id)
break
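
A minimal driver for the fetch loop above, equivalent to what bin/ghlister
does for "list 1-1000" (conf and mk_session as built in bin/ghlister):

    from swh.lister.github import lister

    lister.fetch(conf, mk_session, min_id=1, max_id=1000)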

swh/lister/github/models.py Normal file (44 lines)

@ -0,0 +1,44 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime
from sqlalchemy import Column
from sqlalchemy import Boolean, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base
SQLBase = declarative_base()
class Repository(SQLBase):
"""a GitHub repository"""
__tablename__ = 'repos'
id = Column(Integer, primary_key=True)
name = Column(String, index=True)
full_name = Column(String, index=True)
html_url = Column(String)
description = Column(String)
fork = Column(Boolean, index=True)
last_seen = Column(DateTime, nullable=False)
def __init__(self, id, name=None, full_name=None, html_url=None,
description=None, fork=None):
self.id = id
self.last_seen = datetime.now()
if name is not None:
self.name = name
if full_name is not None:
self.full_name = full_name
if html_url is not None:
self.html_url = html_url
if description is not None:
self.description = description
if fork is not None:
self.fork = fork
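
inject_repo in lister.py builds these rows straight from the GitHub API
payload; a hand-rolled equivalent (field values made up for illustration):

    repo = Repository(id=1, name='grit', full_name='mojombo/grit', fork=False)
    db_session.add(repo)  # last_seen is set automatically in __init__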