diff --git a/bin/ghlister b/bin/ghlister
index aceeeb0..da4c5d9 100755
--- a/bin/ghlister
+++ b/bin/ghlister
@@ -1,135 +1,135 @@
 #!/usr/bin/python3

 # Copyright (C) 2015 Stefano Zacchiroli
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import argparse
 import configparser
 import logging
 import os
 import sys

 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker

-from ghlister import lister, models
-from ghlister.db_utils import session_scope
+from swh.lister.github import lister, models
+from swh.lister.github.db_utils import session_scope


 DEFAULT_CONF = {
     'cache_dir': './cache',
     'log_dir': './log',
     'cache_json': 'False',
 }


 def db_connect(db_url):
     engine = create_engine(db_url)
     session = sessionmaker(bind=engine)

     return (engine, session)


 def int_interval(s):
     """parse an "N-M" string as an interval.

     Return an (N,M) int (or None) pair

     """
     def not_an_interval():
         raise argparse.ArgumentTypeError('not an interval: ' + s)

     def parse_int(s):
         if s:
             return int(s)
         else:
             return None

     if '-' not in s:
         not_an_interval()
     parts = s.split('-')
     if len(parts) > 2:
         not_an_interval()
     return tuple([parse_int(p) for p in parts])


 def parse_args():
     cli = argparse.ArgumentParser(
         description='list GitHub repositories and load them into a DB')
     cli.add_argument('--db-url', '-d', metavar='SQLALCHEMY_URL',
                      help='SQLAlchemy DB URL (override conffile); see '
                           '')  # NOQA
     cli.add_argument('--verbose', '-v', action='store_true',
                      help='be verbose')

     subcli = cli.add_subparsers(dest='action')
     subcli.add_parser('createdb', help='initialize DB')
     subcli.add_parser('dropdb', help='destroy DB')

     list_cli = subcli.add_parser('list', help='list repositories')
     list_cli.add_argument('interval', type=int_interval,
                           help='interval of repository IDs to list, '
                                'in N-M format; either N or M can be omitted.')

     list_cli = subcli.add_parser('catchup',
                                  help='catchup with new repos since last time')

     args = cli.parse_args()

     if not args.action:
         cli.error('no action given')

     return args


 def read_conf(args):
     config = configparser.ConfigParser(defaults=DEFAULT_CONF)
     config.read(os.path.expanduser('~/.config/swh/lister-github.ini'))

     conf = config._sections['main']

     # overrides
     if args.db_url:
         conf['db_url'] = args.db_url

     # typing
     if 'cache_json' in conf and conf['cache_json'].lower() == 'true':
         conf['cache_json'] = True
     else:
         conf['cache_json'] = False

     return conf


 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)  # XXX

     args = parse_args()
     conf = read_conf(args)

     db_engine, mk_session = db_connect(conf['db_url'])

     if args.action == 'createdb':
         models.SQLBase.metadata.create_all(db_engine)
     elif args.action == 'dropdb':
         models.SQLBase.metadata.drop_all(db_engine)
     elif args.action == 'list':
         lister.fetch(conf, mk_session,
                      min_id=args.interval[0],
                      max_id=args.interval[1])
     elif args.action == 'catchup':
         with session_scope(mk_session) as db_session:
             last_known_id = lister.last_repo_id(db_session)
             if last_known_id is not None:
                 logging.info('catching up from last known repo id: %d' %
                              last_known_id)
                 lister.fetch(conf, mk_session,
                              min_id=last_known_id + 1,
                              max_id=None)
             else:
                 logging.error('Cannot catchup: no last known id found. '
                               'Abort.')
                 sys.exit(2)
diff --git a/ghlister/__init__.py b/swh/lister/github/__init__.py
similarity index 100%
rename from ghlister/__init__.py
rename to swh/lister/github/__init__.py
diff --git a/ghlister/db_utils.py b/swh/lister/github/db_utils.py
similarity index 100%
rename from ghlister/db_utils.py
rename to swh/lister/github/db_utils.py
diff --git a/ghlister/lister.py b/swh/lister/github/lister.py
similarity index 97%
rename from ghlister/lister.py
rename to swh/lister/github/lister.py
index c4b58f1..9bbde6a 100644
--- a/ghlister/lister.py
+++ b/swh/lister/github/lister.py
@@ -1,161 +1,161 @@
 # Copyright (C) 2015 Stefano Zacchiroli
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 # see https://developer.github.com/v3/ for GitHub API documentation

 import gzip
 import logging
 import os
 import re
 import requests
 import time

 from pprint import pformat
 from sqlalchemy import func

-from ghlister.db_utils import session_scope
-from ghlister.models import Repository
+from swh.lister.github.db_utils import session_scope
+from swh.lister.github.models import Repository


 GH_API_URL = 'https://api.github.com'
 MAX_RETRIES = 7
 MAX_SLEEP = 3600  # 1 hour
 CONN_SLEEP = 10

 REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')


 def save_http_response(r, cache_dir):
     def escape_url_path(p):
         return p.replace('/', '__')

     fname = os.path.join(cache_dir,
                          escape_url_path(r.request.path_url) + '.gz')
     with gzip.open(fname, 'w') as f:
         def emit(s):
             f.write(bytes(s, 'UTF-8'))
         emit(pformat(r.request.path_url))
         emit('\n#\n')
         emit(pformat(r.status_code))
         emit('\n#\n')
         emit(pformat(r.headers))
         emit('\n#\n')
         emit(pformat(r.json()))


 def gh_api_request(path, username=None, password=None, headers={}):
     params = {}

     if 'Accept' not in headers:  # request version 3 of the API
         headers['Accept'] = 'application/vnd.github.v3+json'
     params['headers'] = headers
     if username is not None and password is not None:
         params['auth'] = (username, password)

     retries_left = MAX_RETRIES
     while retries_left > 0:
         logging.debug('sending API request: %s' % path)
         try:
             r = requests.get(GH_API_URL + path, **params)
         except requests.exceptions.ConnectionError:
             # network-level connection error, try again
             logging.warn('connection error upon %s: sleep for %d seconds' %
                          (path, CONN_SLEEP))
             time.sleep(CONN_SLEEP)
             retries_left -= 1
             continue

         if r.ok:  # all went well, do not retry
             break

         # detect throttling
         if r.status_code == 403 and \
            int(r.headers['X-RateLimit-Remaining']) == 0:
             delay = int(r.headers['X-RateLimit-Reset']) - time.time()
             delay = min(delay, MAX_SLEEP)
             logging.warn('rate limited upon %s: sleep for %d seconds' %
                          (path, int(delay)))
             time.sleep(delay)
         else:  # unexpected error, abort
             break

         retries_left -= 1

     if not retries_left:
         logging.warn('giving up on %s: max retries exceed' % path)

     return r


 def lookup_repo(db_session, repo_id):
     return db_session.query(Repository) \
                      .filter(Repository.id == repo_id) \
                      .first()


 def last_repo_id(db_session):
     t = db_session.query(func.max(Repository.id)) \
                   .first()
     if t is not None:
         return t[0]
     # else: return None


 INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description', 'fork']


 def inject_repo(db_session, repo):
     logging.debug('injecting repo %d' % repo['id'])
     if lookup_repo(db_session, repo['id']):
         logging.info('not injecting already present repo %d' % repo['id'])
         return
     kwargs = {k: repo[k] for k in INJECT_KEYS if k in repo}
     sql_repo = Repository(**kwargs)
     db_session.add(sql_repo)


 class FetchError(RuntimeError):

     def __init__(self, response):
         self.response = response

     def __str__(self):
         return repr(self.response)


 def fetch(conf, mk_session, min_id=None, max_id=None):
     if min_id is None:
         min_id = 1
     if max_id is None:
         max_id = float('inf')
     next_id = min_id

     cred = {}
     for key in ['username', 'password']:
         if key in conf:
             cred[key] = conf[key]

     while min_id <= next_id <= max_id:
         logging.info('listing repos starting at %d' % next_id)
         since = next_id - 1  # github API ?since=... is '>' strict, not '>='
         repos_res = gh_api_request('/repositories?since=%d' % since, **cred)

         if 'cache_dir' in conf and conf['cache_json']:
             save_http_response(repos_res, conf['cache_dir'])

         if not repos_res.ok:
             raise FetchError(repos_res)

         repos = repos_res.json()
         for repo in repos:
             if repo['id'] > max_id:  # do not overstep max_id
                 break
             with session_scope(mk_session) as db_session:
                 inject_repo(db_session, repo)

         if 'next' in repos_res.links:
             next_url = repos_res.links['next']['url']
             m = REPO_API_URL_RE.match(next_url)  # parse next_id
             next_id = int(m.group(1)) + 1
         else:
             logging.info('stopping after id %d, no next link found' % next_id)
             break
diff --git a/ghlister/models.py b/swh/lister/github/models.py
similarity index 100%
rename from ghlister/models.py
rename to swh/lister/github/models.py