Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9342070
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
View Options
diff --git a/bin/ghlister b/bin/ghlister
index aceeeb0..da4c5d9 100755
--- a/bin/ghlister
+++ b/bin/ghlister
@@ -1,135 +1,135 @@
#!/usr/bin/python3
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import argparse
import configparser
import logging
import os
import sys
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
-from ghlister import lister, models
-from ghlister.db_utils import session_scope
+from swh.lister.github import lister, models
+from swh.lister.github.db_utils import session_scope
# Fallback settings, handed to ConfigParser(defaults=...) in read_conf().
# Values are strings, as they would be in the INI file; read_conf() turns
# 'cache_json' into a real bool.
DEFAULT_CONF = {
'cache_dir': './cache',
'log_dir': './log',
'cache_json': 'False',
}
def db_connect(db_url):
    """Create the SQLAlchemy engine for *db_url* and a session factory.

    Return an (engine, session_factory) pair, where the factory is a
    sessionmaker bound to that engine.
    """
    db_engine = create_engine(db_url)
    return db_engine, sessionmaker(bind=db_engine)
def int_interval(s):
    """Parse an "N-M" string as an interval of integers.

    Either bound may be left out ("N-", "-M" or "-"); the corresponding
    element of the result is then None.

    Return an (N, M) pair of ints (or None).

    Raise argparse.ArgumentTypeError if *s* does not contain exactly one
    '-' or if a bound is not a valid integer.
    """
    def not_an_interval():
        return argparse.ArgumentTypeError('not an interval: ' + s)

    def parse_bound(text):
        if not text:
            return None
        try:
            return int(text)
        except ValueError:
            # report bad numbers like any other malformed interval instead
            # of leaking a bare ValueError
            raise not_an_interval() from None

    if s.count('-') != 1:
        raise not_an_interval()
    low, high = s.split('-')
    return (parse_bound(low), parse_bound(high))
def parse_args(argv=None):
    """Parse the command line and return the resulting namespace.

    argv: list of argument strings to parse; when None (the default),
        argparse falls back to the process command line.

    The namespace carries `db_url`, `verbose` and `action` (one of
    'createdb', 'dropdb', 'list' or 'catchup'); for 'list' it also carries
    `interval`, as parsed by int_interval().

    When no action is given, reports the problem through cli.error().
    """
    cli = argparse.ArgumentParser(
        description='list GitHub repositories and load them into a DB')
    cli.add_argument('--db-url', '-d', metavar='SQLALCHEMY_URL',
                     help='SQLAlchemy DB URL (override conffile); see '
                     '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>')  # NOQA
    cli.add_argument('--verbose', '-v', action='store_true',
                     help='be verbose')

    subcli = cli.add_subparsers(dest='action')
    subcli.add_parser('createdb', help='initialize DB')
    subcli.add_parser('dropdb', help='destroy DB')

    list_cli = subcli.add_parser('list', help='list repositories')
    list_cli.add_argument('interval',
                          type=int_interval,
                          help='interval of repository IDs to list, '
                          'in N-M format; either N or M can be omitted.')

    # 'catchup' takes no argument of its own, so its parser is not kept
    subcli.add_parser('catchup',
                      help='catchup with new repos since last time')

    args = cli.parse_args(argv)
    if not args.action:
        cli.error('no action given')

    return args
def read_conf(args):
    """Load the lister configuration and return it as a plain dict.

    Settings are read from the [main] section of
    ~/.config/swh/lister-github.ini, with DEFAULT_CONF filling in whatever
    the file leaves out. A DB URL given on the command line (args.db_url)
    overrides the one from the file.

    'cache_json' is converted to a bool (True only for the string 'true',
    in any letter case); every other value stays a string.
    """
    config = configparser.ConfigParser(defaults=DEFAULT_CONF)
    config.read(os.path.expanduser('~/.config/swh/lister-github.ini'))

    # Use the public API rather than the private config._sections mapping:
    # the latter leaves the defaults out and raises KeyError when the file
    # or its [main] section is missing. raw=True returns values verbatim,
    # without '%' interpolation.
    if config.has_section('main'):
        conf = dict(config.items('main', raw=True))
    else:
        conf = dict(config.defaults())

    # overrides
    if args.db_url:
        conf['db_url'] = args.db_url

    # typing
    conf['cache_json'] = conf.get('cache_json', '').lower() == 'true'

    return conf
if __name__ == '__main__':
    # Script entry point: parse the command line, load the configuration,
    # connect to the DB, then dispatch on the requested action.
    args = parse_args()

    # --verbose switches debug messages on; INFO remains the default level
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO)

    conf = read_conf(args)

    # fail with a readable message rather than a KeyError traceback
    if 'db_url' not in conf:
        logging.error('Cannot connect: no DB URL given (use --db-url or '
                      'set db_url in the configuration file). Abort.')
        sys.exit(2)

    db_engine, mk_session = db_connect(conf['db_url'])

    if args.action == 'createdb':
        models.SQLBase.metadata.create_all(db_engine)
    elif args.action == 'dropdb':
        models.SQLBase.metadata.drop_all(db_engine)
    elif args.action == 'list':
        lister.fetch(conf,
                     mk_session,
                     min_id=args.interval[0],
                     max_id=args.interval[1])
    elif args.action == 'catchup':
        with session_scope(mk_session) as db_session:
            last_known_id = lister.last_repo_id(db_session)
            if last_known_id is not None:
                logging.info('catching up from last known repo id: %d',
                             last_known_id)
                # resume right after the highest id already stored
                lister.fetch(conf,
                             mk_session,
                             min_id=last_known_id + 1,
                             max_id=None)
            else:
                logging.error('Cannot catchup: no last known id found. Abort.')
                sys.exit(2)
diff --git a/ghlister/__init__.py b/swh/lister/github/__init__.py
similarity index 100%
rename from ghlister/__init__.py
rename to swh/lister/github/__init__.py
diff --git a/ghlister/db_utils.py b/swh/lister/github/db_utils.py
similarity index 100%
rename from ghlister/db_utils.py
rename to swh/lister/github/db_utils.py
diff --git a/ghlister/lister.py b/swh/lister/github/lister.py
similarity index 97%
rename from ghlister/lister.py
rename to swh/lister/github/lister.py
index c4b58f1..9bbde6a 100644
--- a/ghlister/lister.py
+++ b/swh/lister/github/lister.py
@@ -1,161 +1,161 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# see https://developer.github.com/v3/ for GitHub API documentation
import gzip
import logging
import os
import re
import requests
import time
from pprint import pformat
from sqlalchemy import func
-from ghlister.db_utils import session_scope
-from ghlister.models import Repository
+from swh.lister.github.db_utils import session_scope
+from swh.lister.github.models import Repository
# Base URL prepended to every request path by gh_api_request()
GH_API_URL = 'https://api.github.com'
# Number of attempts gh_api_request() makes before giving up
MAX_RETRIES = 7
# Upper bound, in seconds, on a single rate-limit sleep
MAX_SLEEP = 3600 # 1 hour
# Seconds to wait after a network-level connection error
CONN_SLEEP = 10
# Extracts the ?since=<id> value from a /repositories URL (see fetch())
REPO_API_URL_RE = re.compile(r'^.*/repositories\?since=(\d+)')
def save_http_response(r, cache_dir):
    """Dump an HTTP response to a gzip-compressed file under *cache_dir*.

    r: response object exposing request.path_url, status_code, headers and
        json().
    cache_dir: directory receiving the dump; created if it does not exist.

    The file is named after r.request.path_url, with every '/' replaced by
    '__' and '.gz' appended. It holds the pretty-printed path, status code,
    headers and decoded JSON body, in that order, separated by '#' lines.
    """
    def escape_url_path(p):
        return p.replace('/', '__')

    # a missing cache directory would otherwise make gzip.open() fail
    os.makedirs(cache_dir, exist_ok=True)

    fname = os.path.join(cache_dir,
                         escape_url_path(r.request.path_url) + '.gz')
    with gzip.open(fname, 'wb') as f:
        def emit(s):
            f.write(s.encode('UTF-8'))

        emit(pformat(r.request.path_url))
        emit('\n#\n')
        emit(pformat(r.status_code))
        emit('\n#\n')
        emit(pformat(r.headers))
        emit('\n#\n')
        emit(pformat(r.json()))
def gh_api_request(path, username=None, password=None, headers=None):
    """Send a GET request to the GitHub API, retrying on transient failures.

    path: request path (with query string), appended to GH_API_URL.
    username, password: credentials for HTTP authentication; used only when
        both are given.
    headers: extra HTTP headers. An 'Accept' header requesting version 3 of
        the API is added unless the caller supplies one. The mapping passed
        in is left untouched.

    Up to MAX_RETRIES attempts are made: connection errors are retried
    after CONN_SLEEP seconds, rate-limited responses after waiting for the
    announced reset (at most MAX_SLEEP seconds). Any other non-OK response
    stops the retries at once.

    Return the last response received, which may be a non-OK one.

    Raise requests.exceptions.ConnectionError if no attempt ever got a
    response.
    """
    # work on a copy: never mutate the caller's dict, nor a shared default
    headers = dict(headers) if headers else {}
    if 'Accept' not in headers:  # request version 3 of the API
        headers['Accept'] = 'application/vnd.github.v3+json'

    params = {'headers': headers}
    if username is not None and password is not None:
        params['auth'] = (username, password)

    r = None
    last_error = None
    retries_left = MAX_RETRIES
    while retries_left > 0:
        logging.debug('sending API request: %s', path)
        try:
            r = requests.get(GH_API_URL + path, **params)
        except requests.exceptions.ConnectionError as e:
            # network-level connection error, try again
            last_error = e
            logging.warning('connection error upon %s: sleep for %d seconds',
                            path, CONN_SLEEP)
            time.sleep(CONN_SLEEP)
            retries_left -= 1
            continue

        if r.ok:  # all went well, do not retry
            break

        # detect throttling; a 403 without rate-limit headers is treated as
        # an ordinary error
        if r.status_code == 403 and \
           int(r.headers.get('X-RateLimit-Remaining', 1)) == 0:
            delay = int(r.headers.get('X-RateLimit-Reset', 0)) - time.time()
            # the reset time may already be past: never sleep a negative time
            delay = max(0, min(delay, MAX_SLEEP))
            logging.warning('rate limited upon %s: sleep for %d seconds',
                            path, int(delay))
            time.sleep(delay)
        else:  # unexpected error, abort
            break
        retries_left -= 1

    if not retries_left:
        logging.warning('giving up on %s: max retries exceed', path)

    if r is None and last_error is not None:
        # every attempt failed at the network level: there is no response
        # to hand back, so surface the last connection error
        raise last_error

    return r
def lookup_repo(db_session, repo_id):
    """Return the first Repository row whose id equals *repo_id*.

    The result is whatever Query.first() yields for that filter.
    """
    matching = db_session.query(Repository).filter(Repository.id == repo_id)
    return matching.first()
def last_repo_id(db_session):
    """Return the highest Repository id known to the DB.

    The value is the first column of a max(Repository.id) query; None is
    returned when the query yields no row.
    """
    row = db_session.query(func.max(Repository.id)).first()
    if row is None:
        return None
    return row[0]
# Keys copied by inject_repo() from a repository dict to the Repository row;
# keys missing from the dict are skipped.
INJECT_KEYS = ['id', 'name', 'full_name', 'html_url', 'description', 'fork']
def inject_repo(db_session, repo):
    """Add *repo* to *db_session* unless its id is already in the DB.

    repo: mapping describing one repository; it must have an 'id' entry.
        Of its other entries, only those listed in INJECT_KEYS are copied
        to the new Repository row.
    """
    repo_id = repo['id']
    logging.debug('injecting repo %d' % repo_id)

    if lookup_repo(db_session, repo_id):
        logging.info('not injecting already present repo %d' % repo_id)
        return

    fields = {}
    for key in INJECT_KEYS:
        if key in repo:
            fields[key] = repo[key]
    db_session.add(Repository(**fields))
class FetchError(RuntimeError):
    """Signals that listing repositories failed.

    The HTTP response involved is kept in the `response` attribute, and its
    repr() is used as the error message.
    """

    def __init__(self, response):
        # hand the response to the base class too, so that it shows up in
        # self.args like the payload of any standard exception
        super().__init__(response)
        self.response = response

    def __str__(self):
        return repr(self.response)
def fetch(conf, mk_session, min_id=None, max_id=None):
    """List GitHub repositories and inject them into the DB.

    Walks the /repositories API endpoint page by page, starting at
    *min_id*, and injects every listed repository whose id does not exceed
    *max_id*. Stops when the next page would start past *max_id* or when
    the API returns no 'next' link.

    conf: configuration mapping. 'username' and 'password', when present,
        are passed on as API credentials; each response is dumped under
        conf['cache_dir'] when that key is present and conf['cache_json']
        is true.
    mk_session: session factory handed to session_scope().
    min_id: first repository id to list (default: 1).
    max_id: last repository id to list (default: no upper bound).

    Raise FetchError if an API request yields a non-OK response, or if the
    'next' link cannot be parsed.
    """
    if min_id is None:
        min_id = 1
    if max_id is None:
        max_id = float('inf')
    next_id = min_id

    cred = {key: conf[key] for key in ('username', 'password') if key in conf}

    while min_id <= next_id <= max_id:
        logging.info('listing repos starting at %d', next_id)
        since = next_id - 1  # github API ?since=... is '>' strict, not '>='
        repos_res = gh_api_request('/repositories?since=%d' % since, **cred)

        if 'cache_dir' in conf and conf['cache_json']:
            save_http_response(repos_res, conf['cache_dir'])
        if not repos_res.ok:
            raise FetchError(repos_res)

        for repo in repos_res.json():
            if repo['id'] > max_id:  # do not overstep max_id
                break
            # one session per repository
            with session_scope(mk_session) as db_session:
                inject_repo(db_session, repo)

        if 'next' not in repos_res.links:
            logging.info('stopping after id %d, no next link found', next_id)
            break

        next_url = repos_res.links['next']['url']
        m = REPO_API_URL_RE.match(next_url)  # parse next_id
        if m is None:
            # without a usable ?since= value the listing cannot go on
            logging.error('cannot parse next link: %s', next_url)
            raise FetchError(repos_res)
        next_id = int(m.group(1)) + 1
diff --git a/ghlister/models.py b/swh/lister/github/models.py
similarity index 100%
rename from ghlister/models.py
rename to swh/lister/github/models.py
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 12:28 PM (2 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3253533
Attached To
rDLS Listers
Event Timeline
Log In to Comment