Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9344567
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
View Options
diff --git a/README.md b/README.md
index 21bae5d..7d2bade 100644
--- a/README.md
+++ b/README.md
@@ -1,120 +1,120 @@
SWH-lister
============
The Software Heritage Lister is both a library module that centralizes
common lister behaviors and a provider of lister implementations.
Current lister implementations are:
- swh-lister-debian
- swh-lister-github
- swh-lister-gitlab
- swh-lister-bitbucket
Licensing
----------
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
See top-level LICENSE file for the full text of the GNU General Public License
along with this program.
Dependencies
------------
- python3
- python3-requests
- python3-sqlalchemy
More details in requirements*.txt
Local deployment
-----------
## lister-github
### Preparation steps
1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing)
2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/
3. create configuration file ~/.config/swh/lister-github.com.yml
4. Bootstrap the db instance schema
$ createdb lister-github
$ python3 -m swh.lister.cli --db-url postgres:///lister-github \
--lister github \
--create-tables
### Configuration file sample
Minimalistic configuration:
$ cat ~/.config/swh/lister-github.com.yml
# see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
lister_db_url: postgres:///lister-github
credentials: []
cache_responses: True
cache_dir: /home/zack/.cache/swh/lister/github.com
Note: This expects storage (5002) and scheduler (5008) services to run locally
### Run
$ python3
>>> import logging
>>> logging.basicConfig(level=logging.DEBUG)
>>> from swh.lister.github.tasks import RangeGitHubLister; RangeGitHubLister().run(364, 365)
INFO:root:listing repos starting at 364
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com
DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost
DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1
## lister-gitlab
### Preparation steps
1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing)
2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/
3. create configuration file ~/.config/swh/lister-gitlab.yml
4. Bootstrap the db instance schema
$ createdb lister-gitlab
$ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \
--lister gitlab \
--create-tables
### Configuration file sample
$ cat ~/.config/swh/lister-gitlab.yml
# see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
lister_db_url: postgres:///lister-gitlab
credentials: []
cache_responses: True
cache_dir: /home/zack/.cache/swh/lister/gitlab
Note: This expects storage (5002) and scheduler (5008) services to run locally
### Run
$ python3
Python 3.6.6 (default, Jun 27 2018, 14:44:17)
[GCC 8.1.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2,
- {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc'})
+ {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20})
>>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task(
- {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc'})
+ {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20})
>>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task(
{'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4',
- 'sort': 'asc'})
+ 'sort': 'asc', 'per_page': 20})
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
index 654cfc3..d24d773 100644
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -1,119 +1,122 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import time
from ..core.page_by_page_lister import PageByPageHttpLister
from .models import GitLabModel
class GitLabLister(PageByPageHttpLister):
# Template path expecting an integer that represents the page id
PATH_TEMPLATE = '/projects?page=%d&order_by=id'
MODEL = GitLabModel
LISTER_NAME = 'gitlab'
def __init__(self, api_baseurl=None, instance=None,
             override_config=None, sort='asc', per_page=20):
    """Set up a lister bound to one GitLab instance.

    Args:
        api_baseurl: forwarded as-is to the parent constructor
        instance: name of the instance; stored on `self.instance`
        override_config: forwarded as-is to the parent constructor
        sort: value of the `sort` query parameter appended to
            PATH_TEMPLATE
        per_page: value of the `per_page` query parameter; it is only
            appended to PATH_TEMPLATE when it differs from 20 (the
            signature default)

    """
    super().__init__(api_baseurl=api_baseurl,
                     override_config=override_config)
    self.instance = instance
    # The class-level template is shadowed by an instance-level one
    # carrying the extra query parameters.
    self.PATH_TEMPLATE = '%s&sort=%s' % (self.PATH_TEMPLATE, sort)
    if per_page != 20:
        self.PATH_TEMPLATE = '%s&per_page=%s' % (
            self.PATH_TEMPLATE, per_page)
@property
def ADDITIONAL_CONFIG(self):
    """Extend the inherited additional configuration.

    The 'credentials' entry is redeclared because its structure
    differs between the ancestor classes and this class
    (cf. request_params).

    """
    config = super().ADDITIONAL_CONFIG
    # 'credentials' is a dict of (instance, {username, password}) dict
    config['credentials'] = ('dict', {})
    return config
def request_params(self, identifier):
    """Get the full parameters passed to requests given the
    transport_request identifier.

    For the gitlab lister, the 'credentials' entries is configured
    per instance. For example:

    - credentials:
      - gitlab.com:
        - username: user0
          password: <pass>
        - username: user1
          password: <pass>
        - ...
      - other-gitlab-instance:
        ...

    Args:
        identifier: not used by this implementation

    Returns:
        dict with a 'headers' entry and, when credentials are
        configured for `self.instance`, an 'auth' entry holding a
        (username, password) tuple picked at random among them.

    """
    params = {
        'headers': self.request_headers() or {}
    }
    # Retrieve the credentials per instance. An instance without any
    # entry (or with an empty list) is queried without authentication
    # instead of raising KeyError/IndexError.
    creds = self.config['credentials']
    creds_lister = creds.get(self.instance) if creds else None
    if creds_lister:
        auth = random.choice(creds_lister)
        if auth:
            params['auth'] = (auth['username'], auth['password'])
    return params
def uid(self, repo):
    """Build the identifier of `repo`: the instance name, a slash, then
    the repository's 'path_with_namespace' value.

    """
    instance = self.instance
    full_path = repo['path_with_namespace']
    return '%s/%s' % (instance, full_path)
def get_model_from_repo(self, repo):
    """Map one repository entry of the api response to the dict of
    fields kept by the lister.

    Args:
        repo: mapping providing the 'name', 'path_with_namespace',
            'web_url', 'http_url_to_repo' and 'description' keys

    Returns:
        dict with keys instance, uid, name, full_name, html_url,
        origin_url, origin_type (always 'git') and description.

    """
    model = {
        'instance': self.instance,
        'uid': self.uid(repo),
    }
    # (model key, key in the api payload), copied over verbatim
    copied_fields = (
        ('name', 'name'),
        ('full_name', 'path_with_namespace'),
        ('html_url', 'web_url'),
        ('origin_url', 'http_url_to_repo'),
    )
    for model_key, repo_key in copied_fields:
        model[model_key] = repo[repo_key]
    model['origin_type'] = 'git'
    model['description'] = repo['description']
    return model
def transport_quota_check(self, response):
    """Deal with rate limit if any.

    Args:
        response: object exposing `headers` and `status_code`

    Returns:
        (True, delay) when the response is a 403 with no request
        remaining, where delay is the number of seconds until the
        'RateLimit-Reset' time, clamped to [0, 3600];
        (False, 0) otherwise.

    """
    headers = response.headers
    # not all gitlab instance have rate limit
    if 'RateLimit-Remaining' not in headers:
        return False, 0
    reqs_remaining = int(headers['RateLimit-Remaining'])
    if response.status_code != 403 or reqs_remaining != 0:
        return False, 0
    reset_at = int(headers['RateLimit-Reset'])
    # A reset time already in the past would give a negative delay,
    # hence the lower bound; the upper bound caps the wait to one hour.
    delay = max(0, min(reset_at - time.time(), 3600))
    return True, delay
def _get_int(self, headers, key):
_val = headers.get(key)
if _val:
return int(_val)
def get_next_target_from_response(self, response):
    """Determine the next page identifier.

    Returns:
        The 'x-next-page' header of `response` as read by _get_int:
        an int, or None when the header is missing or empty.

    """
    headers = response.headers
    next_page = self._get_int(headers, 'x-next-page')
    return next_page
def get_pages_information(self):
    """Determine pages information.

    Queries the first page through transport_head and reads the
    pagination headers of the response.

    Returns:
        Tuple made of the 'x-total', 'x-total-pages' and 'x-per-page'
        header values, each as read by _get_int (int, or None when
        the header is missing or empty).

    """
    headers = self.transport_head(identifier=1).headers
    return tuple(
        self._get_int(headers, name)
        for name in ('x-total', 'x-total-pages', 'x-per-page'))
def transport_response_simplified(self, response):
    """Decode the json body of `response` and convert each of its
    entries with get_model_from_repo.

    Returns:
        list of converted entries, in the order of the response.

    """
    return list(map(self.get_model_from_repo, response.json()))
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
index 84675a1..9adcf12 100644
--- a/swh/lister/gitlab/tasks.py
+++ b/swh/lister/gitlab/tasks.py
@@ -1,63 +1,63 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
from celery import group
from .. import utils
from ..core.tasks import ListerTaskBase, RangeListerTask
from .lister import GitLabLister
class GitLabListerTask(ListerTaskBase):
    """Base task building the GitLabLister used by the gitlab tasks."""

    def new_lister(self, *, api_baseurl='https://gitlab.com/api/v4',
                   instance='gitlab', sort='asc', per_page=20):
        """Instantiate a GitLabLister.

        Args:
            api_baseurl: base url of the api of the instance to list
            instance: name of the instance
            sort: sort order passed to the lister
            per_page: number of entries per page passed to the lister

        Returns:
            the GitLabLister built from those arguments.

        """
        # per_page must be forwarded, otherwise the value given by the
        # caller is silently dropped.
        return GitLabLister(
            api_baseurl=api_baseurl, instance=instance, sort=sort,
            per_page=per_page)
class RangeGitLabLister(GitLabListerTask, RangeListerTask):
    """GitLab lister restricted to a range (lists the origins available
    on the specified range).

    """
    task_queue = 'swh_lister_gitlab_refresh'
class FullGitLabRelister(GitLabListerTask):
    """Full GitLab lister (list all available origins from the api).

    """
    task_queue = 'swh_lister_gitlab_refresh'

    # nb pages; given as second argument to utils.split_range
    nb_pages = 10

    def run_task(self, lister_args=None):
        """Split the pages of the instance into ranges and trigger one
        RangeGitLabLister task per range, in random order.

        Args:
            lister_args: keyword arguments for new_lister, also handed
                to every range task; None means no argument

        """
        args = {} if lister_args is None else lister_args
        _, total_pages, _ = self.new_lister(**args).get_pages_information()
        ranges = list(utils.split_range(total_pages, self.nb_pages))
        random.shuffle(ranges)
        range_task = RangeGitLabLister()
        signatures = (
            range_task.s(minv, maxv, lister_args=args)
            for minv, maxv in ranges)
        group(signatures)()
class IncrementalGitLabLister(GitLabListerTask):
    """Incremental GitLab lister (list only new available origins).

    """
    task_queue = 'swh_lister_gitlab_discover'

    def run_task(self, lister_args=None):
        """List the instance in descending order.

        Args:
            lister_args: keyword arguments for new_lister; its 'sort'
                entry, if any, is overridden with 'desc'. None means
                no argument.

        Returns:
            whatever the lister's run() returns.

        """
        # Work on a copy so that the dict given by the caller is not
        # modified when forcing the sort order.
        lister_args = dict(lister_args or {}, sort='desc')
        lister = self.new_lister(**lister_args)
        _, total_pages, _ = lister.get_pages_information()
        # stopping as soon as existing origins for that instance are detected
        return lister.run(min_bound=1, max_bound=total_pages,
                          check_existence=True)
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 2:34 PM (4 d, 10 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3253688
Attached To
rDLS Listers
Event Timeline
Log In to Comment