initial check in

This commit is contained in:
Stefano Zacchiroli 2015-04-26 10:32:17 +02:00
commit e44226544a
7 changed files with 327 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
cache/

26
bin/batch Executable file
View file

@ -0,0 +1,26 @@
#!/bin/bash
# Run one "million-sized" batch of the GitHub lister, tunneled through a
# local Tor proxy.
#
# Usage: batch MILLION_NO [MIN_ID]
#   MILLION_NO  1-based batch number N, covering repo ids
#               [(N-1)*1e6 + 1, N*1e6]
#   MIN_ID      optional override of the batch lower bound (useful to
#               resume an interrupted batch)

PROXY="127.0.0.1:8118"	# use Tor

BATCH_NO="$1"
shift
if [ -z "$BATCH_NO" ] ; then
    echo "Usage: batch MILLION_NO [MIN_ID]"
    exit 2
fi
MIN_ID="$1"
shift

export https_proxy="$PROXY"
export PYTHONPATH="$(pwd)"

# POSIX $(( )) arithmetic instead of the deprecated $[ ] form
min_id=$(( (BATCH_NO - 1) * 1000000 + 1 ))
max_id=$(( BATCH_NO * 1000000 ))

# allow min_id override on the command line; default to 0 when MIN_ID is
# unset so the expression stays valid (0 never beats the computed min_id,
# which is >= 1) -- the old $[ $MIN_ID > ... ] broke when MIN_ID was empty
min_id=$(( ${MIN_ID:-0} > min_id ? ${MIN_ID:-0} : min_id ))

cmd="bin/ghlister list ${min_id}-${max_id}"
echo Running $cmd ...
$cmd

106
bin/ghlister Executable file
View file

@ -0,0 +1,106 @@
#!/usr/bin/python3
import argparse
import configparser
import logging
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from ghlister import lister, models
# fallback values for ~/.config/ghlister.ini (see read_conf below); note
# that, due to how read_conf extracts the 'main' section, these defaults
# are NOT present in the returned conf dict unless explicitly set in the
# conffile
DEFAULT_CONF = {
    'cache_dir': './cache',
    'log_dir': './log',
}

# where to store raw json answers, for further processing/re-injection
# NOTE(review): CACHE_DIR appears unused in this script (lister.fetch reads
# conf['cache_dir'] instead) — candidate for removal; confirm no other user
CACHE_DIR = 'cache'
def db_connect(db_url):
    """Connect to the given SQLAlchemy database URL.

    Return an (engine, session_factory) pair.
    """
    engine = create_engine(db_url)
    session_factory = sessionmaker(bind=engine)
    return engine, session_factory
def int_interval(s):
    """Parse an "N-M" string as an interval.

    Return an (N, M) pair of ints; either member is None when the
    corresponding bound is omitted (e.g. "5-" or "-10").

    Raise argparse.ArgumentTypeError when s is not a valid interval.
    """
    def fail():
        raise argparse.ArgumentTypeError('not an interval: ' + s)

    if '-' not in s:
        fail()
    bounds = s.split('-')
    if len(bounds) > 2:
        fail()
    # an empty bound (e.g. the N of "-10") maps to None
    return tuple(int(bound) if bound else None for bound in bounds)
def parse_args():
    """Parse the ghlister command line.

    Sub-commands: createdb, dropdb, list (the latter takes a mandatory
    N-M id interval). Return the parsed argparse namespace.
    """
    parser = argparse.ArgumentParser(
        description='list GitHub repositories and load them into a DB')
    parser.add_argument('--db-url', '-d', metavar='SQLALCHEMY_URL',
                        help='SQLAlchemy DB URL (override conffile); see '
                        '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>')  # NOQA
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='be verbose')

    actions = parser.add_subparsers(dest='action')
    actions.add_parser('createdb', help='initialize DB')
    actions.add_parser('dropdb', help='destroy DB')

    list_parser = actions.add_parser('list', help='list repositories')
    list_parser.add_argument('interval',
                             type=int_interval,
                             help='interval of repository IDs to list, '
                             'in N-M format; either N or M can be omitted.')

    return parser.parse_args()
def read_conf(args):
    """Read ~/.config/ghlister.ini and return its [main] section as a dict,
    applying command-line overrides (currently only --db-url).
    """
    config = configparser.ConfigParser(defaults=DEFAULT_CONF)
    config.read(os.path.expanduser('~/.config/ghlister.ini'))
    # NOTE(review): this reaches into configparser's private _sections
    # attribute. Unlike config['main'], it returns ONLY options explicitly
    # set in [main] and *excludes* the DEFAULT_CONF fallbacks — downstream
    # code (e.g. "'cache_dir' in conf" in lister.fetch) appears to rely on
    # that exclusion, so switching to the public API would change behavior.
    # Also raises KeyError when the conffile lacks a [main] section — TODO
    # confirm both behaviors are intended.
    conf = config._sections['main']

    # overrides
    if args.db_url:
        conf['db_url'] = args.db_url

    return conf
if __name__ == '__main__':
    # XXX: hardcoded log setup; presumably conf['log_dir'] should be used
    # instead — confirm. Also, --verbose is parsed but never acted upon here.
    logging.basicConfig(level=logging.INFO)  # XXX
    args = parse_args()
    conf = read_conf(args)

    # connect to the DB configured in the conffile (or via --db-url)
    db_engine, mk_session = db_connect(conf['db_url'])

    if args.action == 'createdb':
        models.SQLBase.metadata.create_all(db_engine)
    elif args.action == 'dropdb':
        models.SQLBase.metadata.drop_all(db_engine)
    elif args.action == 'list':
        # args.interval is the (min, max) pair produced by int_interval;
        # either member may be None (open-ended interval)
        lister.fetch(conf,
                     mk_session,
                     min_id=args.interval[0],
                     max_id=args.interval[1])

1
ghlister/__init__.py Normal file
View file

@ -0,0 +1 @@
# placeholder

15
ghlister/db_utils.py Normal file
View file

@ -0,0 +1,15 @@
from contextlib import contextmanager
@contextmanager
def session_scope(mk_session):
    """Provide a transactional scope around a series of DB operations.

    Commit on normal exit; roll back and re-raise on any exception
    (including KeyboardInterrupt/SystemExit, hence BaseException rather
    than a bare ``except:``); always close the session.
    """
    session = mk_session()
    try:
        yield session
        session.commit()
    except BaseException:
        # roll the transaction back before propagating the error
        session.rollback()
        raise
    finally:
        session.close()

137
ghlister/lister.py Normal file
View file

@ -0,0 +1,137 @@
# see https://developer.github.com/v3/ for GitHub API documentation
import gzip
import logging
import os
import re
import requests
import time
from pprint import pformat
from ghlister.db_utils import session_scope
from ghlister.models import Repository
# base URL of the GitHub REST API
GH_API_URL = 'https://api.github.com'
# max attempts per API request before giving up (see gh_api_request)
MAX_RETRIES = 5
# upper bound on rate-limit back-off sleeps, in seconds
MAX_SLEEP = 3600  # 1 hour

# extracts the numeric ?since= value from a pagination ("next" link) URL
REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')
def save_http_response(r, cache_dir):
    """Dump an HTTP API response to a gzipped cache file.

    The file, stored under cache_dir and named after the request path
    ('/' escaped to '__', '.gz' suffix appended), contains the request
    path, status code, headers, and pretty-printed JSON body, separated
    by '#' lines, for further processing/re-injection.
    """
    def escape_url_path(p):
        # '/' cannot appear in a file name
        return p.replace('/', '__')

    fname = os.path.join(cache_dir,
                         escape_url_path(r.request.path_url) + '.gz')
    with gzip.open(fname, 'w') as f:
        def emit(s):
            # gzip 'w' mode is binary: encode before writing
            f.write(bytes(s, 'UTF-8'))

        emit(pformat(r.request.path_url))
        emit('\n#\n')
        emit(pformat(r.status_code))
        emit('\n#\n')
        emit(pformat(r.headers))
        emit('\n#\n')
        emit(pformat(r.json()))
def gh_api_request(path, username=None, password=None, headers=None):
    """GET a GitHub API path and return the requests Response.

    Args:
        path: API path, appended to GH_API_URL
        username, password: optional HTTP basic auth credentials (used
            only when both are given)
        headers: optional extra HTTP headers; an API v3 Accept header is
            added unless one is already present

    Upon 403 rate-limiting, sleep until the advertised reset time (at
    most MAX_SLEEP seconds) and retry, up to MAX_RETRIES attempts; any
    other error aborts the retry loop. The last Response is returned
    either way — callers must check r.ok themselves.
    """
    # default to a fresh dict per call: the old `headers={}` default was
    # mutated below, leaking the Accept header across calls
    if headers is None:
        headers = {}
    params = {}
    if 'Accept' not in headers:  # request version 3 of the API
        headers['Accept'] = 'application/vnd.github.v3+json'
    params['headers'] = headers
    if username is not None and password is not None:
        params['auth'] = (username, password)

    retries_left = MAX_RETRIES
    while retries_left > 0:
        logging.debug('sending API request: %s' % path)
        r = requests.get(GH_API_URL + path, **params)
        if r.ok:  # all went well, do not retry
            break

        # detect throttling
        if r.status_code == 403 and \
           int(r.headers['X-RateLimit-Remaining']) == 0:
            delay = int(r.headers['X-RateLimit-Reset']) - time.time()
            delay = min(delay, MAX_SLEEP)
            # logging.warn is deprecated in favor of logging.warning
            logging.warning('rate limited upon %s: sleep for %d seconds' %
                            (path, int(delay)))
            time.sleep(delay)
        else:  # unexpected error, abort
            break

        retries_left -= 1

    if not retries_left:
        logging.warning('giving up on %s: max retries exceed' % path)

    return r
def lookup_repo(db_session, repo_id):
    """Return the Repository row with the given id, or None if absent."""
    query = db_session.query(Repository).filter(Repository.id == repo_id)
    return query.first()
# repository attributes copied verbatim from the GitHub API answer
INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description', 'fork']


def inject_repo(db_session, repo):
    """Add a repository (GitHub API dict) to the DB session, unless a row
    with the same id is already present.
    """
    logging.debug('injecting repo %d' % repo['id'])
    if lookup_repo(db_session, repo['id']):
        logging.info('not injecting already present repo %d' % repo['id'])
        return
    attrs = {key: repo[key] for key in INJECT_KEYS if key in repo}
    db_session.add(Repository(**attrs))
class FetchError(RuntimeError):
    """Raised when a GitHub API request ultimately fails.

    Carries the offending HTTP response as ``response``.
    """

    def __init__(self, response):
        self.response = response

    def __str__(self):
        return repr(self.response)
def fetch(conf, mk_session, min_id=None, max_id=None):
    """List GitHub repositories with ids in [min_id, max_id] and inject
    them into the DB, one session/transaction per repository.

    Args:
        conf: dict with optional keys 'username'/'password' (API auth)
            and 'cache_dir' (when present, raw responses are cached there)
        mk_session: SQLAlchemy session factory (see session_scope)
        min_id: lower id bound, defaults to 1
        max_id: upper id bound, defaults to unbounded (float('inf'))

    Raise FetchError upon unrecoverable API errors; pagination follows
    the responses' 'next' links until max_id is reached or no link is
    left.
    """
    if min_id is None:
        min_id = 1
    if max_id is None:
        max_id = float('inf')
    next_id = min_id

    # pick up only the credentials actually present in conf
    cred = {}
    for key in ['username', 'password']:
        if key in conf:
            cred[key] = conf[key]

    while min_id <= next_id <= max_id:
        logging.info('listing repos starting at %d' % (next_id - 1))
        # "- 1" because ?since=... is '<' strict, not '<='
        # NOTE(review): per the GitHub API, ?since=N returns repos with
        # id > N, yet the request below uses since=next_id while the
        # "- 1" is applied only in the log line above — this looks like
        # it skips repo next_id itself (e.g. the very first min_id).
        # Confirm whether the request should be since=next_id-1.
        repos_res = gh_api_request('/repositories?since=%d' % next_id, **cred)

        if 'cache_dir' in conf:
            save_http_response(repos_res, conf['cache_dir'])
        if not repos_res.ok:
            raise FetchError(repos_res)

        repos = repos_res.json()
        for repo in repos:
            if repo['id'] > max_id:  # do not overstep max_id
                break
            # one transaction per repo: a failure rolls back only that repo
            with session_scope(mk_session) as db_session:
                inject_repo(db_session, repo)

        if 'next' in repos_res.links:
            next_url = repos_res.links['next']['url']
            m = REPO_API_URL_RE.match(next_url)  # parse next_id
            next_id = int(m.group(1))
        else:
            logging.info('stopping after id %d, no next link found' % next_id)
            break

41
ghlister/models.py Normal file
View file

@ -0,0 +1,41 @@
from datetime import datetime
from sqlalchemy import Column
from sqlalchemy import Boolean, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base
SQLBase = declarative_base()


class Repository(SQLBase):
    """a GitHub repository"""

    __tablename__ = 'repos'

    id = Column(Integer, primary_key=True)
    name = Column(String, index=True)
    full_name = Column(String, index=True)
    html_url = Column(String)
    description = Column(String)
    fork = Column(Boolean, index=True)
    last_seen = Column(DateTime, nullable=False)

    def __init__(self, id, name=None, full_name=None, html_url=None,
                 description=None, fork=None):
        self.id = id
        # record when this repo was last observed on GitHub
        self.last_seen = datetime.now()
        # set only the attributes that were actually provided
        provided = (('name', name),
                    ('full_name', full_name),
                    ('html_url', html_url),
                    ('description', description),
                    ('fork', fork))
        for attr, value in provided:
            if value is not None:
                setattr(self, attr, value)