Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/README.md b/README.md
index 21bae5d..7d2bade 100644
--- a/README.md
+++ b/README.md
@@ -1,120 +1,120 @@
SWH-lister
============
The Software Heritage Lister is both a library module that centralizes
lister behaviors and a collection of lister implementations.
Current lister implementations are:
- swh-lister-debian
- swh-lister-github
- swh-lister-gitlab
- swh-lister-bitbucket
Licensing
----------
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
See top-level LICENSE file for the full text of the GNU General Public License
along with this program.
Dependencies
------------
- python3
- python3-requests
- python3-sqlalchemy
More details in requirements*.txt
Local deployment
-----------
## lister-github
### Preparation steps
1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing)
2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/
3. create configuration file ~/.config/swh/lister-github.com.yml
4. Bootstrap the db instance schema
$ createdb lister-github
$ python3 -m swh.lister.cli --db-url postgres:///lister-github \
--lister github \
--create-tables
### Configuration file sample
Minimalistic configuration:
$ cat ~/.config/swh/lister-github.com.yml
# see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
lister_db_url: postgres:///lister-github
credentials: []
cache_responses: True
cache_dir: /home/zack/.cache/swh/lister/github.com
Note: This expects storage (5002) and scheduler (5008) services to run locally
### Run
$ python3
>>> import logging
>>> logging.basicConfig(level=logging.DEBUG)
>>> from swh.lister.github.tasks import RangeGitHubLister; RangeGitHubLister().run(364, 365)
INFO:root:listing repos starting at 364
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com
DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost
DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1
## lister-gitlab
### Preparation steps
1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing)
2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/
3. create configuration file ~/.config/swh/lister-gitlab.yml
4. Bootstrap the db instance schema
$ createdb lister-gitlab
$ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \
--lister gitlab \
--create-tables
### Configuration file sample
$ cat ~/.config/swh/lister-gitlab.yml
# see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
lister_db_url: postgres:///lister-gitlab
credentials: []
cache_responses: True
cache_dir: /home/zack/.cache/swh/lister/gitlab
Note: This expects storage (5002) and scheduler (5008) services to run locally
### Run
$ python3
Python 3.6.6 (default, Jun 27 2018, 14:44:17)
[GCC 8.1.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2,
- {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc'})
+ {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20})
>>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task(
- {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc'})
+ {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20})
>>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task(
{'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4',
- 'sort': 'asc'})
+ 'sort': 'asc', 'per_page': 20})
diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py
index 654cfc3..d24d773 100644
--- a/swh/lister/gitlab/lister.py
+++ b/swh/lister/gitlab/lister.py
@@ -1,119 +1,122 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import time
from ..core.page_by_page_lister import PageByPageHttpLister
from .models import GitLabModel
class GitLabLister(PageByPageHttpLister):
    """Page-by-page lister for the projects of one GitLab instance.

    Args:
        api_baseurl: base url of the instance api, forwarded to the
            parent constructor.
        instance: name of the GitLab instance; used to prefix uids and
            to select the matching entry in the 'credentials' config.
        override_config: forwarded to the parent constructor.
        sort: value of the 'sort' query parameter appended to the
            listing path.
        per_page: value of the 'per_page' query parameter; only
            appended to the listing path when it differs from 20.

    """
    # Template path expecting an integer that represents the page id
    PATH_TEMPLATE = '/projects?page=%d&order_by=id'
    MODEL = GitLabModel
    LISTER_NAME = 'gitlab'

    def __init__(self, api_baseurl=None, instance=None,
                 override_config=None, sort='asc', per_page=20):
        super().__init__(api_baseurl=api_baseurl,
                         override_config=override_config)
        self.instance = instance
        # Instance-level template: the '%d' page placeholder is kept
        # as-is since the class template is substituted as a string.
        self.PATH_TEMPLATE = '%s&sort=%s' % (self.PATH_TEMPLATE, sort)
        if per_page != 20:
            self.PATH_TEMPLATE = '%s&per_page=%s' % (
                self.PATH_TEMPLATE, per_page)

    @property
    def ADDITIONAL_CONFIG(self):
        """Override additional config as the 'credentials' structure changes
           between the ancestor classes and this class.

           cf. request_params method below

        """
        # Copy so that the mapping returned by the ancestor is never
        # mutated in place.
        default_config = dict(super().ADDITIONAL_CONFIG)
        # 'credentials' is a dict of (instance, {username, password}) dict
        default_config['credentials'] = ('dict', {})
        return default_config

    def request_params(self, identifier):
        """Get the full parameters passed to requests given the
        transport_request identifier.

        For the gitlab lister, the 'credentials' entries are configured
        per instance. For example:

        - credentials:
          - gitlab.com:
            - username: user0
              password: <pass>
            - username: user1
              password: <pass>
          - ...
          - other-gitlab-instance:
            ...

        When no credentials are configured for the current instance, the
        returned parameters carry no 'auth' entry.

        Returns:
            dict with a 'headers' key and, when a credential could be
            picked, an 'auth' key holding a (username, password) tuple.

        """
        params = {
            'headers': self.request_headers() or {}
        }
        # Retrieve the credentials per instance. Other instances may be
        # configured while this one is not, or this one may have an
        # empty list: both cases fall back to unauthenticated requests
        # instead of raising KeyError / IndexError.
        creds = self.config['credentials'] or {}
        creds_lister = creds.get(self.instance)
        if creds_lister:
            auth = random.choice(creds_lister)
            if auth:
                params['auth'] = (auth['username'], auth['password'])
        return params

    def uid(self, repo):
        """Build the unique identifier '<instance>/<path_with_namespace>'.

        """
        return '%s/%s' % (self.instance, repo['path_with_namespace'])

    def get_model_from_repo(self, repo):
        """Map a project entry of the api response to the model dict.

        """
        return {
            'instance': self.instance,
            'uid': self.uid(repo),
            'name': repo['name'],
            'full_name': repo['path_with_namespace'],
            'html_url': repo['web_url'],
            'origin_url': repo['http_url_to_repo'],
            'origin_type': 'git',
            'description': repo['description'],
        }

    def transport_quota_check(self, response):
        """Deal with rate limit if any.

        Returns:
            (must_retry, delay) tuple. delay is a number of seconds in
            [0, 3600]; it is 0 when no retry is needed.

        """
        # not all gitlab instance have rate limit
        if 'RateLimit-Remaining' in response.headers:
            reqs_remaining = int(response.headers['RateLimit-Remaining'])
            if response.status_code == 403 and reqs_remaining == 0:
                reset_at = int(response.headers['RateLimit-Reset'])
                # Clamp to [0, 3600]: the reset time may already be in
                # the past, and a negative delay is not a valid wait.
                delay = max(0, min(reset_at - time.time(), 3600))
                return True, delay
        return False, 0

    def _get_int(self, headers, key):
        """Return headers[key] as an int, or None when the header is
        missing or empty.

        """
        _val = headers.get(key)
        if _val:
            return int(_val)
        return None

    def get_next_target_from_response(self, response):
        """Determine the next page identifier.

        """
        return self._get_int(response.headers, 'x-next-page')

    def get_pages_information(self):
        """Determine pages information.

        Returns:
            (total, total pages, per page) tuple read from the
            'x-total', 'x-total-pages' and 'x-per-page' headers of a
            HEAD request on page 1; each item is None when its header
            is absent.

        """
        response = self.transport_head(identifier=1)
        h = response.headers
        return (self._get_int(h, 'x-total'),
                self._get_int(h, 'x-total-pages'),
                self._get_int(h, 'x-per-page'))

    def transport_response_simplified(self, response):
        """Convert the json body of a listing response into a list of
        model dicts.

        """
        repos = response.json()
        return [self.get_model_from_repo(repo) for repo in repos]
diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py
index 84675a1..9adcf12 100644
--- a/swh/lister/gitlab/tasks.py
+++ b/swh/lister/gitlab/tasks.py
@@ -1,63 +1,63 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
from celery import group
from .. import utils
from ..core.tasks import ListerTaskBase, RangeListerTask
from .lister import GitLabLister
class GitLabListerTask(ListerTaskBase):
    """Base task for the GitLab listers: knows how to build the lister.

    """
    def new_lister(self, *, api_baseurl='https://gitlab.com/api/v4',
                   instance='gitlab', sort='asc', per_page=20):
        """Instantiate a GitLabLister from the task arguments.

        All arguments are keyword-only and forwarded unchanged to
        GitLabLister. per_page must be forwarded as well, otherwise the
        value requested by the caller is silently dropped.

        """
        return GitLabLister(
            api_baseurl=api_baseurl, instance=instance, sort=sort,
            per_page=per_page)
class RangeGitLabLister(GitLabListerTask, RangeListerTask):
    """GitLab lister task restricted to a range (lists the origins
    available on the specified range).

    """
    # Name of the queue this task is attached to
    task_queue = 'swh_lister_gitlab_refresh'
class FullGitLabRelister(GitLabListerTask):
    """Full GitLab lister (list all available origins from the api).

    The page space reported by the instance is split into ranges, the
    ranges are shuffled, and one RangeGitLabLister signature per range
    is dispatched as a celery group.

    """
    task_queue = 'swh_lister_gitlab_refresh'

    # nb pages: second argument given to utils.split_range
    # (presumably the size of each range -- TODO confirm)
    nb_pages = 10

    def run_task(self, lister_args=None):
        """Split the full listing into range tasks and schedule them.

        Args:
            lister_args: optional dict of keyword arguments for
                new_lister; also passed along to every range task.

        """
        if lister_args is None:
            lister_args = {}

        pages_info = self.new_lister(**lister_args).get_pages_information()
        _total, page_count, _per_page = pages_info

        page_ranges = list(utils.split_range(page_count, self.nb_pages))
        random.shuffle(page_ranges)

        range_lister = RangeGitLabLister()
        signatures = (
            range_lister.s(lower, upper, lister_args=lister_args)
            for lower, upper in page_ranges
        )
        group(signatures)()
class IncrementalGitLabLister(GitLabListerTask):
    """Incremental GitLab lister (list only new available origins).

    """
    task_queue = 'swh_lister_gitlab_discover'

    def run_task(self, lister_args=None):
        """List pages in descending order until known origins are met.

        Args:
            lister_args: optional dict of keyword arguments for
                new_lister. Any 'sort' entry is overridden with 'desc'.
                The dict itself is left untouched.

        Returns:
            whatever the lister's run method returns.

        """
        # Work on a copy: forcing 'sort' below must not leak into the
        # dict owned by the caller.
        lister_args = dict(lister_args) if lister_args else {}
        lister_args['sort'] = 'desc'
        lister = self.new_lister(**lister_args)
        _, total_pages, _ = lister.get_pages_information()
        # stopping as soon as existing origins for that instance are detected
        return lister.run(min_bound=1, max_bound=total_pages,
                          check_existence=True)

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 2:34 PM (4 d, 10 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3253688

Event Timeline