diff --git a/README.md b/README.md index ff18dc5..96b7571 100644 --- a/README.md +++ b/README.md @@ -1,124 +1,122 @@ SWH-lister ============ The Software Heritage Lister is both a library module to permit to centralize lister behaviors, and to provide lister implementations. Actual lister implementations are: - swh-lister-debian - swh-lister-github - swh-lister-bitbucket Licensing ---------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ - python3 - python3-requests - python3-sqlalchemy More details in requirements*.txt Local deployment ----------- ## lister-github ### Preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. 
Bootstrap the db instance schema $ createdb lister-github $ python3 -m swh.lister.cli --db-url postgres:///lister-github \ --lister github \ --create-tables ### Configuration file sample $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-github credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/github.com storage: cls: remote args: url: http://localhost:5002/ ### Run $ python3 >>> import logging >>> logging.basicConfig(level=logging.DEBUG) >>> from swh.lister.github.tasks import RangeGitHubLister >>> RangeGitHubLister().run(364, 365) INFO:root:listing repos starting at 364 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 ## lister-gitlab ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ 3. create configuration file ~/.config/swh/lister-gitlab.yml 4. Bootstrap the db instance schema $ createdb lister-gitlab $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \ --lister gitlab \ --create-tables ### Configuration file sample $ cat ~/.config/swh/lister-gitlab.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-gitlab credentials: [] cache_responses: True cache_dir: /home/zack/.cache/swh/lister/gitlab storage: cls: remote args: url: http://localhost:5002/ ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. 
>>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, - instance='salsa.debian.org', api_baseurl='https://salsa.debian.org/api/v4') - >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, - instance='gitlab.freedesktop.org', api_baseurl='https://gitlab.freedesktop.org/api/v4') - >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, - instance='gitlab.gnome.org', api_baseurl='https://gitlab.gnome.org/api/v4') - >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, - instance='gitlab.inria.fr', api_baseurl='https://gitlab.inria.fr/api/v4') - >>> + {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc'}) + >>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task( + {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc'}) + >>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task( + {'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', + 'sort': 'asc'}) diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py index e480994..c54063b 100644 --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -1,27 +1,27 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import (IndexingDiscoveryListerTask, RangeListerTask, IndexingRefreshListerTask, ListerTaskBase) from .lister import BitBucketLister class BitBucketListerTask(ListerTaskBase): - def new_lister(self): - return BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0') + def new_lister(self, *, api_baseurl='https://api.bitbucket.org/2.0'): + return BitBucketLister(api_baseurl=api_baseurl) class 
IncrementalBitBucketLister(BitBucketListerTask, IndexingDiscoveryListerTask): task_queue = 'swh_lister_bitbucket_discover' class RangeBitBucketLister(BitBucketListerTask, RangeListerTask): task_queue = 'swh_lister_bitbucket_refresh' class FullBitBucketRelister(BitBucketListerTask, IndexingRefreshListerTask): task_queue = 'swh_lister_bitbucket_refresh' diff --git a/swh/lister/core/tasks.py b/swh/lister/core/tasks.py index a8305e7..c3deb8b 100644 --- a/swh/lister/core/tasks.py +++ b/swh/lister/core/tasks.py @@ -1,89 +1,95 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import random from celery import group from swh.scheduler.task import Task, TaskType from .abstractattribute import AbstractAttribute class AbstractTaskMeta(abc.ABCMeta, TaskType): pass class ListerTaskBase(Task, metaclass=AbstractTaskMeta): """Lister Tasks define the process of periodically requesting batches of repository information from source code hosting services. They instantiate Listers to do batches of work at periodic intervals. There are two main kinds of lister tasks: 1. Discovering new repositories. 2. Refreshing the list of already discovered repositories. If the hosting service is indexable (according to the requirements of :class:`SWHIndexingLister`), then we can optionally partition the set of known repositories into sub-sets to distribute the work. This means that there is a third possible Task type for Indexing Listers: 3. Discover or refresh a specific range of indices. """ task_queue = AbstractAttribute('Celery Task queue name') @abc.abstractmethod - def new_lister(self, *args, **kwargs): + def new_lister(self, **lister_args): """Return a new lister of the appropriate type. """ pass @abc.abstractmethod - def run_task(self, *args, **kwargs): + def run_task(self, *, lister_args=None): pass # Paging/Indexing lister tasks derivatives # (cf. 
{github/bitbucket/gitlab}/tasks) class RangeListerTask(ListerTaskBase): """Range lister task. """ - def run_task(self, start, end, *args, **kwargs): - lister = self.new_lister(*args, **kwargs) + def run_task(self, start, end, lister_args=None): + if lister_args is None: + lister_args = {} + lister = self.new_lister(**lister_args) return lister.run(min_bound=start, max_bound=end) # Indexing Lister tasks derivatives (cf. {github/bitbucket}/tasks) class IndexingDiscoveryListerTask(ListerTaskBase): """Incremental indexing lister task. """ - def run_task(self, *args, **kwargs): - lister = self.new_lister(*args, **kwargs) + def run_task(self, *, lister_args=None): + if lister_args is None: + lister_args = {} + lister = self.new_lister(**lister_args) return lister.run(min_bound=lister.db_last_index(), max_bound=None) class IndexingRefreshListerTask(ListerTaskBase): """Full indexing lister task. """ GROUP_SPLIT = 10000 - def run_task(self, *args, **kwargs): - lister = self.new_lister(*args, **kwargs) + def run_task(self, *, lister_args=None): + if lister_args is None: + lister_args = {} + lister = self.new_lister(**lister_args) ranges = lister.db_partition_indices(self.GROUP_SPLIT) random.shuffle(ranges) range_task = RangeListerTask() - group(range_task.s(minv, maxv, *args, **kwargs) + group(range_task.s(minv, maxv, lister_args) for minv, maxv in ranges)() diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py index cdac167..0ddb653 100644 --- a/swh/lister/debian/tasks.py +++ b/swh/lister/debian/tasks.py @@ -1,18 +1,18 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import ListerTaskBase from .lister import DebianLister class DebianListerTask(ListerTaskBase): task_queue = 'swh_lister_debian' def new_lister(self): return DebianLister() def 
run_task(self, distribution): lister = self.new_lister() return lister.run(distribution) diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py index ba04e8c..c2e841e 100644 --- a/swh/lister/github/tasks.py +++ b/swh/lister/github/tasks.py @@ -1,26 +1,26 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import (IndexingDiscoveryListerTask, RangeListerTask, IndexingRefreshListerTask, ListerTaskBase) from .lister import GitHubLister class GitHubListerTask(ListerTaskBase): - def new_lister(self): - return GitHubLister(api_baseurl='https://api.github.com') + def new_lister(self, *, api_baseurl='https://api.github.com'): + return GitHubLister(api_baseurl=api_baseurl) class IncrementalGitHubLister(GitHubListerTask, IndexingDiscoveryListerTask): task_queue = 'swh_lister_github_discover' class RangeGitHubLister(GitHubListerTask, RangeListerTask): task_queue = 'swh_lister_github_refresh' class FullGitHubRelister(GitHubListerTask, IndexingRefreshListerTask): task_queue = 'swh_lister_github_refresh' diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py index 9aee77c..7d46078 100644 --- a/swh/lister/gitlab/tasks.py +++ b/swh/lister/gitlab/tasks.py @@ -1,63 +1,63 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random from celery import group from .. 
import utils from ..core.tasks import ListerTaskBase, RangeListerTask from .lister import GitLabLister class GitLabListerTask(ListerTaskBase): - def new_lister(self, api_baseurl='https://gitlab.com/api/v4', - instance='gitlab.com'): - return GitLabLister(api_baseurl=api_baseurl, instance=instance) + def new_lister(self, *, api_baseurl='https://gitlab.com/api/v4', + instance='gitlab.com', sort='asc'): + return GitLabLister( + api_baseurl=api_baseurl, instance=instance, sort=sort) class RangeGitLabLister(GitLabListerTask, RangeListerTask): """Range GitLab lister (list available origins on specified range) """ task_queue = 'swh_lister_gitlab_refresh' class FullGitLabRelister(GitLabListerTask): """Full GitLab lister (list all available origins from the api). """ task_queue = 'swh_lister_gitlab_refresh' # nb pages nb_pages = 10 - def run_task(self, *args, **kwargs): - lister = self.new_lister(*args, **kwargs) + def run_task(self, lister_args=None): + if lister_args is None: + lister_args = {} + lister = self.new_lister(**lister_args) _, total_pages, _ = lister.get_pages_information() ranges = list(utils.split_range(total_pages, self.nb_pages)) random.shuffle(ranges) range_task = RangeGitLabLister() - group(range_task.s(minv, maxv, *args, **kwargs) + group(range_task.s(minv, maxv, lister_args=lister_args) for minv, maxv in ranges)() -class IncrementalGitLabLister(ListerTaskBase): +class IncrementalGitLabLister(GitLabListerTask): """Incremental GitLab lister (list only new available origins). 
""" task_queue = 'swh_lister_gitlab_discover' - def new_lister(self, api_baseurl='https://gitlab.com/api/v4', - instance='gitlab.com'): - # assuming going forward in desc order, page 1 through - return GitLabLister(instance=instance, api_baseurl=api_baseurl, - sort='desc') - - def run_task(self, *args, **kwargs): - lister = self.new_lister(*args, **kwargs) + def run_task(self, lister_args=None): + # Force 'desc' ordering on a copy of the arguments so that the + # dict passed in by the caller is never mutated. + lister_args = dict(lister_args or {}, sort='desc') + lister = self.new_lister(**lister_args) _, total_pages, _ = lister.get_pages_information() # stopping as soon as existing origins for that instance are detected return lister.run(min_bound=1, max_bound=total_pages, check_existence=True)