diff --git a/README b/README deleted file mode 100644 --- a/README +++ /dev/null @@ -1,47 +0,0 @@ - -Licensing -========= - -This program is free software: you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation, either version 3 of the License, or (at your option) any later -version. - -This program is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -PARTICULAR PURPOSE. See the GNU General Public License for more details. - -See top-level LICENSE file for the full text of the GNU General Public License -along with this program. - - -Dependencies -============ - -- python3 -- python3-psycopg2 -- python3-requests -- python3-sqlalchemy - - -Deployment -========== - -1. git clone under $GHLISTER_ROOT (of your choosing) -2. mkdir ~/.config/swh/ ~/.cache/swh/lister-github/ -3. edit $GHLISTER_ROOT/etc/crontab and customize GHLISTER_ROOT -4. crontab $GHLISTER_ROOT/etc/crontab -5. create configuration file ~/.config/swh/lister-github.ini - -Sample configuration file -------------------------- - -cat ~/.config/swh/lister-github.ini - - [main] - db_url = postgres:///github - # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls - cache_dir = /home/zack/.cache/swh/lister-github - log_dir = /home/zack/.cache/swh/lister-github - username = foobar # github username - password = quux # github password diff --git a/README.md b/README.md new file mode 100644 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +SWH-lister +============ + +The Software Heritage Lister is both a library module to permit to +centralize lister behaviors, and to provide lister implementations. 
+ +Current lister implementations are: + +- swh-lister-debian +- swh-lister-github +- swh-lister-bitbucket + +Licensing +---------- + +This program is free software: you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +See top-level LICENSE file for the full text of the GNU General Public License +along with this program. + + +Dependencies +------------ + +- python3 +- python3-requests +- python3-sqlalchemy + +More details in requirements*.txt + + +Local deployment +---------------- + +## lister-github + +1. git clone under $GHLISTER_ROOT (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ +3. create configuration file ~/.config/swh/lister-github.com.yml +4. 
Bootstrap the db instance schema + +``` sh +$ createdb lister-github.com +$ bin/ghlister --db-url postgres:///lister-github.com createdb +``` + +Configuration file samples +------------------------- + +## github + +cat ~/.config/swh/lister-github.com.yml + + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-github.com + credentials: [] + cache_responses: True + cache_dir: /home/zack/.cache/swh/lister/github.com + + storage: + cls: remote + args: + url: http://localhost:5002/ diff --git a/TODO b/TODO deleted file mode 100644 --- a/TODO +++ /dev/null @@ -1,57 +0,0 @@ -# -*- mode: org -*- - -* TODO SQL: rework repo_history/repo_creations to use last_seen -* TODO cache dir: split json data from other HTTP info - for easier further processing of additional API data - -* TODO cache dir: split in subdirs - to avoid hitting too hard on the filesystem due to the large amount of files - (200k+) - -* TODO network-level traceback - Traceback (most recent call last): - File "/usr/lib/python3/dist-packages/urllib3/response.py", line 186, in read - data = self._fp.read(amt) - File "/usr/lib/python3.4/http/client.py", line 500, in read - return super(HTTPResponse, self).read(amt) - File "/usr/lib/python3.4/http/client.py", line 529, in readinto - return self._readinto_chunked(b) - File "/usr/lib/python3.4/http/client.py", line 621, in _readinto_chunked - n = self._safe_readinto(mvb) - File "/usr/lib/python3.4/http/client.py", line 680, in _safe_readinto - raise IncompleteRead(bytes(mvb[0:total_bytes]), len(b)) - http.client.IncompleteRead: IncompleteRead(3201 bytes read, 10240 more expected) - - During handling of the above exception, another exception occurred: - - Traceback (most recent call last): - File "/usr/lib/python3/dist-packages/requests/models.py", line 653, in generate - for chunk in self.raw.stream(chunk_size, decode_content=True): - File "/usr/lib/python3/dist-packages/urllib3/response.py", line 256, in 
stream - data = self.read(amt=amt, decode_content=decode_content) - File "/usr/lib/python3/dist-packages/urllib3/response.py", line 214, in read - raise ProtocolError('Connection broken: %r' % e, e) - urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead(3201 bytes read, 10240 more expected)', IncompleteRead(3201 bytes read, 10240 more expected)) - - During handling of the above exception, another exception occurred: - - Traceback (most recent call last): - File "bin/ghlister", line 110, in - max_id=args.interval[1]) - File "/home/zack/dati/projects/github-list-repo/ghlister/lister.py", line 129, in fetch - repos_res = gh_api_request('/repositories?since=%d' % since, **cred) - File "/home/zack/dati/projects/github-list-repo/ghlister/lister.py", line 55, in gh_api_request - r = requests.get(GH_API_URL + path, **params) - File "/usr/lib/python3/dist-packages/requests/api.py", line 60, in get - return request('get', url, **kwargs) - File "/usr/lib/python3/dist-packages/requests/api.py", line 49, in request - return session.request(method=method, url=url, **kwargs) - File "/usr/lib/python3/dist-packages/requests/sessions.py", line 457, in request - resp = self.send(prep, **send_kwargs) - File "/usr/lib/python3/dist-packages/requests/sessions.py", line 606, in send - r.content - File "/usr/lib/python3/dist-packages/requests/models.py", line 724, in content - self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes() - File "/usr/lib/python3/dist-packages/requests/models.py", line 656, in generate - raise ChunkedEncodingError(e) - requests.exceptions.ChunkedEncodingError: ('Connection broken: IncompleteRead(3201 bytes read, 10240 more expected)', IncompleteRead(3201 bytes read, 10240 more expected)) diff --git a/bin/ghlister b/bin/ghlister --- a/bin/ghlister +++ b/bin/ghlister @@ -80,12 +80,14 @@ if args.db_url: override_conf['lister_db_url'] = args.db_url - lister = GitHubLister(override_conf) + lister = 
GitHubLister(lister_name='github.com', + api_baseurl='https://api.github.com', + override_config=override_conf) if args.action == 'createdb': - models.SQLBase.metadata.create_all(lister.db_engine) + models.ModelBase.metadata.create_all(lister.db_engine) elif args.action == 'dropdb': - models.SQLBase.metadata.drop_all(lister.db_engine) + models.ModelBase.metadata.drop_all(lister.db_engine) elif args.action == 'list': lister.fetch(min_id=args.interval[0], max_id=args.interval[1]) diff --git a/docs/tutorial.rst b/docs/tutorial.rst --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -167,15 +167,15 @@ # Copyright (C) 2017 the Software Heritage developers # License: GNU General Public License version 3 or later # See top-level LICENSE file for more information - + from urllib import parse from swh.lister.bitbucket.models import BitBucketModel from swh.lister.core.indexing_lister import SWHIndexingHttpLister - + class BitBucketLister(SWHIndexingHttpLister): PATH_TEMPLATE = '/repositories?after=%s' MODEL = BitBucketModel - + def get_model_from_repo(self, repo): return {'uid': repo['uuid'], 'indexable': repo['created_on'], @@ -185,14 +185,14 @@ 'origin_url': repo['links']['clone'][0]['href'], 'origin_type': repo['scm'], 'description': repo['description']} - + def get_next_target_from_response(self, response): body = response.json() if 'next' in body: return parse.unquote(body['next'].split('after=')[1]) else: return None - + def transport_response_simplified(self, response): repos = response.json()['values'] return [self.get_model_from_repo(repo) for repo in repos] @@ -268,13 +268,14 @@ * GitHub sends the next URL as part of the response header, while BitBucket sends it in the response body. -* GitHub differentiates API versions with a request header (our HTTP transport -mix-in will automatically use any headers provided by an optional -request_headers method that we implement here), while BitBucket has it as part -of their base service URL. 
BitBucket uses the IETF standard HTTP 429 response -code for their rate limit notifications (the HTTP transport mix-in -automatically handles that), while GitHub uses their own custom response -headers that need special treatment. +* GitHub differentiates API versions with a request header (our HTTP + transport mix-in will automatically use any headers provided by an + optional request_headers method that we implement here), while + BitBucket has it as part of their base service URL. BitBucket uses + the IETF standard HTTP 429 response code for their rate limit + notifications (the HTTP transport mix-in automatically handles + that), while GitHub uses their own custom response headers that need + special treatment. * But look at them! 58 lines of Python code, combined, to absorb all repositories from two of the largest and most influential source code hosting @@ -291,7 +292,7 @@ # main task - ghl = GitHubLister(lister_name='github.com', + ghl = GitHubLister(lister_name='github.com', api_baseurl='https://github.com') ghl.run() @@ -300,7 +301,7 @@ # SWHIndexingLister.run identifier = None - do + do response, repos = SWHListerBase.ingest_data(identifier) identifier = GitHubLister.get_next_target_from_response(response) while(identifier) @@ -330,7 +331,7 @@ # SWHListerHttpTransport.transport_request - path = SWHListerBase.api_baseurl + path = SWHListerBase.api_baseurl + SWHListerHttpTransport.PATH_TEMPLATE % identifier headers = SWHListerHttpTransport.request_headers() return http.get(path, headers) diff --git a/etc/crontab b/etc/crontab deleted file mode 100644 --- a/etc/crontab +++ /dev/null @@ -1,5 +0,0 @@ -SHELL=/bin/bash -GHLISTER_ROOT=/home/zack/src/swh-lister-github - -# m h dom mon dow command - 0 8 * * * PYTHONPATH=$GHLISTER_ROOT $GHLISTER_ROOT/bin/ghlister catchup >> ~/.cache/swh/lister-github/$(date +\%Y\%m\%d).log 2>&1 diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py --- a/swh/lister/bitbucket/lister.py +++ 
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click


CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])


def _override_config(ctx):
    """Build the lister configuration override from the click context.

    Returns a dict holding ``lister_db_url`` when a db url was given on
    the command line, and an empty dict otherwise, so that an empty
    ``--db-url`` no longer fails with a KeyError in the subcommands.

    """
    db_url = (ctx.obj or {}).get('db_url')
    return {'lister_db_url': db_url} if db_url else {}


def _update_schema(lister, metadata, createdb, dropdb):
    """Drop and/or create the tables described by `metadata`.

    The drop always runs before the create, so passing both flags
    resets the schema on the lister's db engine.

    """
    if dropdb:
        metadata.drop_all(lister.db_engine)

    if createdb:
        metadata.create_all(lister.db_engine)


# NOTE(review): the default db url names the gitlab database whatever
# the subcommand is; it is kept as is to preserve the existing interface.
@click.group(context_settings=CONTEXT_SETTINGS)
@click.option(
    '--db-url', '-d', default='postgres:///lister-gitlab.com',
    help='SQLAlchemy DB URL; see '
         '<http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls>')  # noqa
@click.pass_context
def cli(ctx, db_url):
    """Initialize db model according to lister.

    """
    config = {}
    if db_url:
        config['db_url'] = db_url
    ctx.obj = config


@cli.command('github')
@click.option('--createdb', is_flag=True, default=False,
              help='create db')
@click.option('--dropdb', is_flag=True, default=False,
              help='Drop db')
@click.pass_context
def github(ctx, createdb, dropdb):
    """Drop and/or create the github lister db schema."""
    # Imported here so that only the requested lister gets loaded
    from .github import models
    from .github.lister import GitHubLister

    lister = GitHubLister(lister_name='github.com',
                          api_baseurl='https://api.github.com',
                          override_config=_override_config(ctx))

    _update_schema(lister, models.ModelBase.metadata, createdb, dropdb)


@cli.command('gitlab')
@click.option('--createdb', is_flag=True, default=False,
              help='create db')
@click.option('--dropdb', is_flag=True, default=False,
              help='Drop db')
@click.pass_context
def gitlab(ctx, createdb, dropdb):
    """Drop and/or create the gitlab lister db schema."""
    from .gitlab import models
    from .gitlab.lister import GitlabLister

    # No trailing slash: the lister's path template already starts
    # with '/', and the base url is concatenated with it as is.
    lister = GitlabLister(lister_name='gitlab.com',
                          api_baseurl='https://gitlab.com/api/v4',
                          override_config=_override_config(ctx))

    _update_schema(lister, models.ModelBase.metadata, createdb, dropdb)


@cli.command('bitbucket')
@click.option('--createdb', is_flag=True, default=False,
              help='create db')
@click.option('--dropdb', is_flag=True, default=False,
              help='Drop db')
@click.pass_context
def bitbucket(ctx, createdb, dropdb):
    """Drop and/or create the bitbucket lister db schema."""
    from .bitbucket import models
    from .bitbucket.lister import BitBucketLister

    lister = BitBucketLister(lister_name='bitbucket.com',
                             api_baseurl='https://api.bitbucket.org/2.0',
                             override_config=_override_config(ctx))

    _update_schema(lister, models.ModelBase.metadata, createdb, dropdb)


if __name__ == '__main__':
    cli()
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import time

from ..core.indexing_lister import SWHIndexingHttpLister
from .models import GitlabModel


class GitlabLister(SWHIndexingHttpLister):
    """Lister walking the paginated gitlab `/projects` api endpoint.

    The page number is used as the request index, since the gitlab api
    offers no indexable identifier to query on.

    """
    # Path to request; %d is the number of the page to fetch
    PATH_TEMPLATE = '/projects?page=%d'
    # gitlab api do not have an indexable identifier so using the page
    # id. `page` may be the first query parameter ('?page=') or a later
    # one ('&page='); '&per_page=' must not match.
    API_URL_INDEX_RE = re.compile(r'^.*/projects.*[?&]page=(\d+).*')
    # Upper bound, in seconds, of the delay returned on rate limit
    MAX_QUOTA_DELAY = 3600
    # The indexable field, the one we are supposed to use in the api
    # query is not part of the lookup query. So, we cannot filter
    # (method filter_before_inject), nor detect and disable origins
    # (method disable_deleted_repo_tasks)
    MODEL = GitlabModel

    def filter_before_inject(self, models_list):
        """We cannot filter so returns the models_list as is.

        """
        return models_list

    def get_model_from_repo(self, repo):
        """Map one project entry of the api response to the model fields.

        Args:
            repo (dict): one project as returned by the gitlab api

        Returns:
            dict of values used to build a GitlabModel

        """
        return {
            'uid': repo['id'],
            'indexable': repo['id'],
            'name': repo['name'],
            'full_name': repo['path_with_namespace'],
            'html_url': repo['web_url'],
            'origin_url': repo['http_url_to_repo'],
            'origin_type': 'git',
            'description': repo['description'],
            # FIXME: How to determine the fork nature? Do we need that
            # information? Variable `repo` holds a `count_fork` key
            # which is the number of forks for that
            # repository. Default to False for now.
            'fork': False,
        }

    def transport_quota_check(self, response):
        """Deal with rate limit

        Args:
            response: the http response to inspect

        Returns:
            (True, delay in seconds) when the request quota is
            exhausted, (False, 0) otherwise. The delay is within
            [0, MAX_QUOTA_DELAY].

        """
        # TODO: need to dig further about the actual returned code
        # (not seen yet in documentation)
        if response.status_code != 403:
            return False, 0

        headers = response.headers
        try:
            reqs_remaining = int(headers['RateLimit-Remaining'])
        except (KeyError, ValueError):
            # No usable quota information: not a rate limit answer
            return False, 0

        if reqs_remaining != 0:
            return False, 0

        try:
            reset_at = int(headers['RateLimit-Reset'])
        except (KeyError, ValueError):
            # Quota exhausted but no reset time given: wait the maximum
            return True, self.MAX_QUOTA_DELAY

        # The reset time may already be in the past; never return a
        # negative delay
        delay = min(max(reset_at - time.time(), 0), self.MAX_QUOTA_DELAY)
        return True, delay

    def get_next_target_from_response(self, response):
        """Deal with pagination

        Returns:
            the number of the next page to fetch (int), or None when
            the response holds no 'next' link.

        Raises:
            ValueError: if the 'next' link holds no page number

        """
        if 'next' not in response.links:
            return None

        next_url = response.links['next']['url']
        page = self.API_URL_INDEX_RE.match(next_url)
        if page is None:
            raise ValueError(
                'No page number found in next link %r' % next_url)
        return int(page.group(1))

    def transport_response_simplified(self, response):
        """Convert the json body of the response into a list of model dicts.

        """
        repos = response.json()
        return [self.get_model_from_repo(repo) for repo in repos]
from swh.lister.core.tasks import (IndexingDiscoveryListerTask,
                                   IndexingRangeListerTask,
                                   IndexingRefreshListerTask, ListerTaskBase)

from .lister import GitlabLister


class GitlabDotComListerTask(ListerTaskBase):
    """Base task providing a lister bound to the gitlab.com api (v4)."""

    def new_lister(self):
        """Return a GitlabLister instance targeting gitlab.com."""
        lister_kwargs = {
            'lister_name': 'gitlab.com',
            'api_baseurl': 'https://gitlab.com/api/v4',
        }
        return GitlabLister(**lister_kwargs)


class IncrementalGitlabDotComLister(GitlabDotComListerTask,
                                    IndexingDiscoveryListerTask):
    """Discovery task for gitlab.com, consumed from the discover queue."""
    task_queue = 'swh_lister_gitlab_discover'


class RangeGitlabLister(GitlabDotComListerTask, IndexingRangeListerTask):
    """Range task for gitlab.com, consumed from the refresh queue."""
    task_queue = 'swh_lister_gitlab_refresh'


class FullGitlabRelister(GitlabDotComListerTask, IndexingRefreshListerTask):
    """Refresh task for gitlab.com, consumed from the refresh queue."""
    task_queue = 'swh_lister_gitlab_refresh'