commit e44226544aeee9b794afbd8834f07624a799ff02
Author: Stefano Zacchiroli
Date:   Sun Apr 26 10:32:17 2015 +0200

    initial check in

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e934adf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+cache/
diff --git a/bin/batch b/bin/batch
new file mode 100755
index 0000000..5f87aef
--- /dev/null
+++ b/bin/batch
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Run bin/ghlister over one batch of one million GitHub repository IDs.
+#
+# Usage: batch MILLION_NO [MIN_ID]
+#
+#   MILLION_NO  1-based batch number: batch N spans repository IDs
+#               (N - 1) * 1000000 + 1 through N * 1000000
+#   MIN_ID      optional lower bound; it replaces the start of the batch
+#               when it is greater than it
+
+PROXY="127.0.0.1:8118"  # use Tor
+
+BATCH_NO="$1"
+shift
+if [ -z "$BATCH_NO" ] ; then
+    echo "Usage: batch MILLION_NO [MIN_ID]"
+    exit 2
+fi
+
+MIN_ID="$1"
+shift
+
+export https_proxy=$PROXY
+export PYTHONPATH="$(pwd)"
+
+min_id=$(( ($BATCH_NO - 1) * 1000000 + 1 ))
+max_id=$(( $BATCH_NO * 1000000 ))
+
+# allow min_id override on the command line; MIN_ID is optional, so only
+# compare when it was given (an empty operand is an arithmetic syntax error)
+if [ -n "$MIN_ID" ] && [ "$MIN_ID" -gt "$min_id" ] ; then
+    min_id=$MIN_ID
+fi
+
+cmd="bin/ghlister list ${min_id}-${max_id}"
+echo Running $cmd ...
+$cmd
diff --git a/bin/ghlister b/bin/ghlister
new file mode 100755
index 0000000..a5b0207
--- /dev/null
+++ b/bin/ghlister
@@ -0,0 +1,134 @@
+#!/usr/bin/python3
+
+import argparse
+import configparser
+import logging
+import os
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from ghlister import lister, models
+
+
+DEFAULT_CONF = {
+    'cache_dir': './cache',
+    'log_dir': './log',
+}
+
+
+# where to store raw json answers, for further processing/re-injection
+CACHE_DIR = 'cache'
+
+
+def db_connect(db_url):
+    """connect to the DB at db_url.
+
+    Return an (engine, session factory) pair, with the factory bound to
+    the engine
+
+    """
+    engine = create_engine(db_url)
+    Session = sessionmaker(bind=engine)
+
+    return (engine, Session)
+
+
+def int_interval(s):
+    """parse an "N-M" string as an interval.
+
+    Return an (N,M) int (or None) pair; a bound is None when its side of
+    the "-" is empty. Raise argparse.ArgumentTypeError if s is not in
+    "N-M" format.
+
+    """
+    def not_an_interval():
+        raise argparse.ArgumentTypeError('not an interval: ' + s)
+
+    def parse_bound(bound):
+        if not bound:
+            return None
+        try:
+            return int(bound)
+        except ValueError:
+            # a non-numeric bound is reported like any other bad interval
+            not_an_interval()
+
+    parts = s.split('-')
+    if len(parts) != 2:  # either no "-" at all, or more than one
+        not_an_interval()
+    return tuple(parse_bound(p) for p in parts)
+
+
+def parse_args():
+    """parse the command line and return the resulting namespace"""
+    cli = argparse.ArgumentParser(
+        description='list GitHub repositories and load them into a DB')
+    cli.add_argument('--db-url', '-d', metavar='SQLALCHEMY_URL',
+                     help='SQLAlchemy DB URL (override conffile); see '
+                     'https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls')  # NOQA
+    cli.add_argument('--verbose', '-v', action='store_true',
+                     help='be verbose')
+
+    subcli = cli.add_subparsers(dest='action')
+    # an action is mandatory: without one there is nothing to do
+    subcli.required = True
+    subcli.add_parser('createdb', help='initialize DB')
+    subcli.add_parser('dropdb', help='destroy DB')
+
+    list_cli = subcli.add_parser('list', help='list repositories')
+    list_cli.add_argument('interval',
+                          type=int_interval,
+                          help='interval of repository IDs to list, '
+                          'in N-M format; either N or M can be omitted.')
+
+    args = cli.parse_args()
+
+    return args
+
+
+def read_conf(args):
+    """read the [main] section of ~/.config/ghlister.ini.
+
+    Return its settings as a dict; a --db-url given on the command line
+    overrides the db_url setting. The dict is empty (save for db_url) if
+    the file or the section is missing.
+
+    """
+    config = configparser.ConfigParser(defaults=DEFAULT_CONF)
+    config.read(os.path.expanduser('~/.config/ghlister.ini'))
+
+    # NOTE(review): _sections is private to configparser and leaves out the
+    # DEFAULT_CONF defaults; switching to config.items('main') would apply
+    # them, and hence enable response caching for everybody. Left as is,
+    # pending a decision.
+    conf = dict(config._sections.get('main', {}))
+
+    # overrides
+    if args.db_url:
+        conf['db_url'] = args.db_url
+
+    return conf
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    conf = read_conf(args)
+
+    # --verbose turns on debug messages
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO)
+
+    if 'db_url' not in conf:
+        raise SystemExit('no DB URL: set db_url in ~/.config/ghlister.ini '
+                         'or pass --db-url')
+    db_engine, mk_session = db_connect(conf['db_url'])
+
+    if args.action == 'createdb':
+        models.SQLBase.metadata.create_all(db_engine)
+    elif args.action == 'dropdb':
+        models.SQLBase.metadata.drop_all(db_engine)
+    elif args.action == 'list':
+        lister.fetch(conf,
+                     mk_session,
+                     min_id=args.interval[0],
+                     max_id=args.interval[1])
diff --git a/ghlister/__init__.py b/ghlister/__init__.py
new file mode 100644
index 0000000..fdffa2a
--- /dev/null
+++ b/ghlister/__init__.py
@@ -0,0 +1 @@
+# placeholder
diff --git a/ghlister/db_utils.py b/ghlister/db_utils.py
new file mode 100644
index 0000000..0997f50
--- /dev/null
+++ b/ghlister/db_utils.py
@@ -0,0 +1,24 @@
+
+from contextlib import contextmanager
+
+
+@contextmanager
+def session_scope(mk_session):
+    """context manager wrapping a DB session in a transaction.
+
+    Create a session with mk_session and yield it; commit when the block
+    completes, roll back and re-raise if it raises, and close the session
+    in either case.
+
+    """
+    session = mk_session()
+    try:
+        yield session
+        session.commit()
+    except BaseException:
+        # catch-all on purpose: whatever interrupts the block (including
+        # KeyboardInterrupt) must roll back, and is re-raised right away
+        session.rollback()
+        raise
+    finally:
+        session.close()
diff --git a/ghlister/lister.py b/ghlister/lister.py
new file mode 100644
index 0000000..6145e3e
--- /dev/null
+++ b/ghlister/lister.py
@@ -0,0 +1,184 @@
+
+# see https://developer.github.com/v3/ for GitHub API documentation
+
+import gzip
+import logging
+import os
+import re
+import requests
+import time
+
+from pprint import pformat
+
+from ghlister.db_utils import session_scope
+from ghlister.models import Repository
+
+
+GH_API_URL = 'https://api.github.com'
+MAX_RETRIES = 5
+MAX_SLEEP = 3600  # 1 hour
+
+REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')
+
+
+def save_http_response(r, cache_dir):
+    """dump the HTTP response r to a gzip-ed file under cache_dir.
+
+    The file is named after the request path, with '/' replaced by '__',
+    and holds the pretty-printed request path, status code, headers and
+    body, separated by '#' lines.
+
+    """
+    os.makedirs(cache_dir, exist_ok=True)
+    fname = os.path.join(cache_dir,
+                         r.request.path_url.replace('/', '__') + '.gz')
+    try:
+        body = r.json()
+    except ValueError:
+        # error responses are saved too, and might not carry JSON
+        body = r.text
+    sections = [r.request.path_url, r.status_code, r.headers, body]
+    with gzip.open(fname, 'w') as f:
+        f.write(bytes('\n#\n'.join(map(pformat, sections)), 'UTF-8'))
+
+
+def gh_api_request(path, username=None, password=None, headers=None):
+    """send a GET request for path to the GitHub API, return the response.
+
+    username and password, if both given, are used for authentication.
+    headers are extra HTTP headers; an Accept header requesting version 3
+    of the API is added to them unless one is given.
+
+    When rate limited, sleep until the limit is reset (MAX_SLEEP seconds
+    at most) and try again, up to MAX_RETRIES times. Any other response,
+    successful or not, is returned as is.
+
+    """
+    # work on a copy, to modify neither a shared default nor the dict of
+    # the caller
+    headers = dict(headers or {})
+    if 'Accept' not in headers:  # request version 3 of the API
+        headers['Accept'] = 'application/vnd.github.v3+json'
+    params = {'headers': headers}
+    if username is not None and password is not None:
+        params['auth'] = (username, password)
+
+    retries_left = MAX_RETRIES
+    while retries_left > 0:
+        logging.debug('sending API request: %s', path)
+        r = requests.get(GH_API_URL + path, **params)
+
+        if r.ok:  # all went well, do not retry
+            break
+
+        # detect throttling; a 403 without rate limit headers is not one
+        if r.status_code == 403 and \
+           int(r.headers.get('X-RateLimit-Remaining', 1)) == 0:
+            reset = int(r.headers.get('X-RateLimit-Reset', 0))
+            # the reset time might be in the past already: do not pass a
+            # negative delay to time.sleep
+            delay = min(max(reset - time.time(), 0), MAX_SLEEP)
+            logging.warning('rate limited upon %s: sleep for %d seconds',
+                            path, int(delay))
+            time.sleep(delay)
+        else:  # unexpected error, abort
+            break
+
+        retries_left -= 1
+
+    if not retries_left:
+        logging.warning('giving up on %s: max retries exceed', path)
+
+    return r
+
+
+def lookup_repo(db_session, repo_id):
+    """return the Repository with ID repo_id, or None if there is none"""
+    return db_session.query(Repository) \
+                     .filter(Repository.id == repo_id) \
+                     .first()
+
+
+INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description', 'fork']
+
+
+def inject_repo(db_session, repo):
+    """add repo, a dict as returned by the GitHub API, to db_session.
+
+    Only the INJECT_KEYS entries of repo are retained. Repositories whose
+    ID is in the DB already are skipped.
+
+    """
+    logging.debug('injecting repo %d', repo['id'])
+    if lookup_repo(db_session, repo['id']):
+        logging.info('not injecting already present repo %d', repo['id'])
+        return
+    kwargs = {k: repo[k] for k in INJECT_KEYS if k in repo}
+    sql_repo = Repository(**kwargs)
+    db_session.add(sql_repo)
+
+
+class FetchError(RuntimeError):
+    """error raised by fetch upon unusable GitHub API responses.
+
+    The HTTP response at fault is available as the response attribute.
+
+    """
+
+    def __init__(self, response):
+        super().__init__(response)  # so that args is filled in
+        self.response = response
+
+    def __str__(self):
+        return repr(self.response)
+
+
+def fetch(conf, mk_session, min_id=None, max_id=None):
+    """list GitHub repositories and inject them into the DB.
+
+    Repositories with min_id <= ID <= max_id are listed, one page of API
+    results at a time, and each is injected within its own DB session,
+    obtained from mk_session. min_id defaults to 1, max_id to no limit.
+
+    conf is the configuration dict: its username and password, if any, are
+    the API credentials; its cache_dir, if any, is where each API response
+    is saved.
+
+    Raise FetchError upon an unsuccessful API response, or one whose next
+    link cannot be parsed.
+
+    """
+    if min_id is None:
+        min_id = 1
+    if max_id is None:
+        max_id = float('inf')
+    # ?since=... is '<' strict, not '<=': start right before min_id, or
+    # min_id itself would never be listed
+    since = min_id - 1
+
+    cred = {k: conf[k] for k in ('username', 'password') if k in conf}
+
+    while min_id - 1 <= since < max_id:
+        logging.info('listing repos starting at %d', since)
+        repos_res = gh_api_request('/repositories?since=%d' % since, **cred)
+
+        if 'cache_dir' in conf:
+            save_http_response(repos_res, conf['cache_dir'])
+        if not repos_res.ok:
+            raise FetchError(repos_res)
+
+        repos = repos_res.json()
+        for repo in repos:
+            if repo['id'] > max_id:  # do not overstep max_id
+                break
+            with session_scope(mk_session) as db_session:
+                inject_repo(db_session, repo)
+
+        if 'next' not in repos_res.links:
+            logging.info('stopping after id %d, no next link found', since)
+            break
+        # the next link holds the ?since=... value of the following page
+        m = REPO_API_URL_RE.match(repos_res.links['next']['url'])
+        if m is None:
+            raise FetchError(repos_res)
+        since = int(m.group(1))
diff --git a/ghlister/models.py b/ghlister/models.py
new file mode 100644
index 0000000..ad867a7
--- /dev/null
+++ b/ghlister/models.py
@@ -0,0 +1,47 @@
+
+from datetime import datetime
+
+from sqlalchemy import Column
+from sqlalchemy import Boolean, DateTime, Integer, String
+from sqlalchemy.ext.declarative import declarative_base
+
+
+SQLBase = declarative_base()
+
+
+class Repository(SQLBase):
+
+    """a GitHub repository"""
+
+    __tablename__ = 'repos'
+
+    id = Column(Integer, primary_key=True)
+
+    name = Column(String, index=True)
+    full_name = Column(String, index=True)
+    html_url = Column(String)
+    description = Column(String)
+    fork = Column(Boolean, index=True)
+
+    last_seen = Column(DateTime, nullable=False)
+
+    def __init__(self, id, name=None, full_name=None, html_url=None,
+                 description=None, fork=None):
+        """create a repository with the given ID, seen just now.
+
+        The other fields are set only if not None; last_seen is set to the
+        current local time.
+
+        """
+        self.id = id
+        self.last_seen = datetime.now()
+        if name is not None:
+            self.name = name
+        if full_name is not None:
+            self.full_name = full_name
+        if html_url is not None:
+            self.html_url = html_url
+        if description is not None:
+            self.description = description
+        if fork is not None:
+            self.fork = fork