diff --git a/requirements-swh.txt b/requirements-swh.txt index 7fa659d..7247006 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ -swh.core +swh.core >= 0.0.73 swh.storage[schemata] >= 0.0.122 swh.scheduler >= 0.0.58 diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py index e69de29..5697901 100644 --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -0,0 +1,43 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging +import pkg_resources + + +logger = logging.getLogger(__name__) + + +LISTERS = {entry_point.name.split('.', 1)[1]: entry_point + for entry_point in pkg_resources.iter_entry_points('swh.workers') + if entry_point.name.split('.', 1)[0] == 'lister'} + + +SUPPORTED_LISTERS = list(LISTERS) + + +def get_lister(lister_name, db_url=None, **conf): + """Instantiate a lister given its name. + + Args: + lister_name (str): Lister's name + conf (dict): Configuration dict (lister db cnx, policy, priority...) + + Returns: + Tuple (instantiated lister, drop_tables function, init schema function, + insert minimum data function) + + """ + if lister_name not in LISTERS: + raise ValueError( + 'Invalid lister %s: only supported listers are %s' % + (lister_name, SUPPORTED_LISTERS)) + if db_url: + conf['lister'] = {'cls': 'local', 'args': {'db': db_url}} + + registry_entry = LISTERS[lister_name].load()() + lister_cls = registry_entry['lister'] + lister = lister_cls(override_config=conf) + return lister diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index c459eb5..d6c2077 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,138 +1,138 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import logging from urllib.parse import urlparse, urljoin from bs4 import BeautifulSoup from requests import Session from requests.adapters import HTTPAdapter from .models import CGitModel from swh.core.utils import grouper from swh.lister.core.lister_base import ListerBase logger = logging.getLogger(__name__) class CGitLister(ListerBase): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. For each found git repository, a query is made at the given url found in this index to gather published "Clone" URLs to be used as origin URL for that git repo. If several "Clone" urls are provided, prefer the http/https one, if any, otherwise fall bak to the first one. A loader task is created for each git repository: Task: Type: load-git Policy: recurring Args: Example: Type: load-git Policy: recurring Args: 'https://git.savannah.gnu.org/git/elisp-es.git' """ MODEL = CGitModel - DEFAULT_URL = 'http://git.savannah.gnu.org/cgit/' + DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/' LISTER_NAME = 'cgit' url_prefix_present = True def __init__(self, url=None, instance=None, override_config=None): """Lister class for CGit repositories. Args: url (str): main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. instance (str): Name of cgit instance. Defaults to url's hostname if unset. """ super().__init__(override_config=override_config) if url is None: url = self.config.get('url', self.DEFAULT_URL) self.url = url if not instance: instance = urlparse(url).hostname self.instance = instance self.session = Session() self.session.mount(self.url, HTTPAdapter(max_retries=3)) def run(self): total = 0 for repos in grouper(self.get_repos(), 10): models = list(filter(None, (self.build_model(repo) for repo in repos))) injected_repos = self.inject_repo_data_into_db(models) self.schedule_missing_tasks(models, injected_repos) self.db_session.commit() total += len(injected_repos) logger.debug('Scheduled %s tasks for %s', total, self.url) def get_repos(self): """Generate git 'project' URLs found on the current CGit server """ next_page = self.url while next_page: bs_idx = self.get_and_parse(next_page) for tr in bs_idx.find( 'div', {"class": "content"}).find_all( "tr", {"class": ""}): yield urljoin(self.url, tr.find('a')['href']) try: pager = bs_idx.find('ul', {'class': 'pager'}) current_page = pager.find('a', {'class': 'current'}) if current_page: next_page = current_page.parent.next_sibling.a['href'] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page next_page = None def build_model(self, repo_url): """Given the URL of a git repo project page on a CGit server, return the repo description (dict) suitable for insertion in the db. """ bs = self.get_and_parse(repo_url) urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})] if not urls: return # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ('http', 'https'): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] return {'uid': repo_url, 'name': bs.find('a', title=re.compile('.+'))['title'], 'origin_type': 'git', 'instance': self.instance, 'origin_url': origin_url, } def get_and_parse(self, url): "Get the given url and parse the retrieved HTML using BeautifulSoup" return BeautifulSoup(self.session.get(url).text, features='html.parser') diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py index 8524c9b..b7887f0 100644 --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -1,86 +1,66 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from os.path import join, dirname -import re -from urllib.parse import urlparse -from unittest.mock import Mock -import requests_mock -from sqlalchemy import create_engine +def test_lister_no_page(requests_mock_datadir, swh_listers): + lister = swh_listers['cgit'] -from swh.lister.cgit.lister import CGitLister -from swh.lister.tests.test_utils import init_db + assert lister.url == 'https://git.savannah.gnu.org/cgit/' + repos = list(lister.get_repos()) + assert len(repos) == 977 -DATADIR = join(dirname(__file__), 'data') + assert repos[0] == 'https://git.savannah.gnu.org/cgit/elisp-es.git/' + # note the url below is NOT a subpath of /cgit/ + assert repos[-1] == 'https://git.savannah.gnu.org/path/to/yetris.git/' # noqa + # note the url below is NOT on the same server + assert repos[-2] == 'http://example.org/cgit/xstarcastle.git/' -def get_response_cb(request, context): - url = urlparse(request.url) - dirname = url.hostname - filename = url.path[1:-1].replace('/', '_') - if url.query: - filename += ',' + url.query - resp = open(join(DATADIR, dirname, filename), 'rb').read() - return resp.decode('ascii', 'ignore') +def test_lister_model(requests_mock_datadir, swh_listers): + lister = swh_listers['cgit'] + repo = next(lister.get_repos()) -def test_lister_no_page(): - with requests_mock.Mocker() as m: - m.get(re.compile('http://git.savannah.gnu.org'), text=get_response_cb) - lister = CGitLister() + model = lister.build_model(repo) + assert model == { + 'uid': 'https://git.savannah.gnu.org/cgit/elisp-es.git/', + 'name': 'elisp-es.git', + 'origin_type': 'git', + 'instance': 'git.savannah.gnu.org', + 'origin_url': 'https://git.savannah.gnu.org/git/elisp-es.git' + } - assert lister.url == 'http://git.savannah.gnu.org/cgit/' - repos = list(lister.get_repos()) - assert len(repos) == 977 +def test_lister_with_pages(requests_mock_datadir, swh_listers): + lister = swh_listers['cgit'] + lister.url = 'https://git.tizen/cgit/' - assert repos[0] == 'http://git.savannah.gnu.org/cgit/elisp-es.git/' - # note the url below is NOT a subpath of /cgit/ - assert repos[-1] == 'http://git.savannah.gnu.org/path/to/yetris.git/' # noqa - # note the url below is NOT on the same server - assert repos[-2] == 'http://example.org/cgit/xstarcastle.git/' + repos = list(lister.get_repos()) + # we should have 16 repos (listed on 3 pages) + assert len(repos) == 16 -def test_lister_model(): - with requests_mock.Mocker() as m: - m.get(re.compile('http://git.savannah.gnu.org'), text=get_response_cb) - lister = CGitLister() +def test_lister_run(requests_mock_datadir, swh_listers): + lister = swh_listers['cgit'] + lister.url = 'https://git.tizen/cgit/' + lister.run() - repo = next(lister.get_repos()) + r = lister.scheduler.search_tasks(task_type='load-git') + assert len(r) == 16 - model = lister.build_model(repo) - assert model == { - 'uid': 'http://git.savannah.gnu.org/cgit/elisp-es.git/', - 'name': 'elisp-es.git', - 'origin_type': 'git', - 'instance': 'git.savannah.gnu.org', - 'origin_url': 'https://git.savannah.gnu.org/git/elisp-es.git' - } + for row in r: + assert row['type'] == 'load-git' + # arguments check + args = row['arguments']['args'] + assert len(args) == 1 + url = args[0] + assert url.startswith('https://git.tizen') -def test_lister_with_pages(): - with requests_mock.Mocker() as m: - m.get(re.compile('http://git.tizen/cgit/'), text=get_response_cb) - lister = CGitLister(url='http://git.tizen/cgit/') - - assert lister.url == 'http://git.tizen/cgit/' - - repos = list(lister.get_repos()) - # we should have 16 repos (listed on 3 pages) - assert len(repos) == 16 - - -def test_lister_run(): - with requests_mock.Mocker() as m: - m.get(re.compile('http://git.tizen/cgit/'), text=get_response_cb) - db = init_db() - conf = {'lister': {'cls': 'local', 'args': {'db': db.url()}}} - lister = CGitLister(url='http://git.tizen/cgit/', - override_config=conf) - engine = create_engine(db.url()) - lister.MODEL.metadata.create_all(engine) - lister.schedule_missing_tasks = Mock(return_value=None) - lister.run() + # kwargs + kwargs = row['arguments']['kwargs'] + assert kwargs == {} + assert row['policy'] == 'recurring' + assert row['priority'] is None diff --git a/swh/lister/cli.py b/swh/lister/cli.py index 9e1dad4..a5ed46c 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,235 +1,206 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging -import pkg_resources from copy import deepcopy from importlib import import_module import click from sqlalchemy import create_engine from swh.core.cli import CONTEXT_SETTINGS from swh.scheduler import get_scheduler from swh.scheduler.task import SWHTask +from swh.lister import get_lister, SUPPORTED_LISTERS, LISTERS from swh.lister.core.models import initialize logger = logging.getLogger(__name__) -LISTERS = {entry_point.name.split('.', 1)[1]: entry_point - for entry_point in pkg_resources.iter_entry_points('swh.workers') - if entry_point.name.split('.', 1)[0] == 'lister'} -SUPPORTED_LISTERS = list(LISTERS) # the key in this dict is the suffix used to match new task-type to be added. # For example for a task which function name is "list_gitlab_full', the default # value used when inserting a new task-type in the scheduler db will be the one # under the 'full' key below (because it matches xxx_full). DEFAULT_TASK_TYPE = { 'full': { # for tasks like 'list_xxx_full()' 'default_interval': '90 days', 'min_interval': '90 days', 'max_interval': '90 days', 'backoff_factor': 1 }, '*': { # value if not suffix matches 'default_interval': '1 day', 'min_interval': '1 day', 'max_interval': '1 day', 'backoff_factor': 1 }, } -def get_lister(lister_name, db_url=None, **conf): - """Instantiate a lister given its name. - - Args: - lister_name (str): Lister's name - conf (dict): Configuration dict (lister db cnx, policy, priority...) - - Returns: - Tuple (instantiated lister, drop_tables function, init schema function, - insert minimum data function) - - """ - if lister_name not in LISTERS: - raise ValueError( - 'Invalid lister %s: only supported listers are %s' % - (lister_name, SUPPORTED_LISTERS)) - if db_url: - conf['lister'] = {'cls': 'local', 'args': {'db': db_url}} - - registry_entry = LISTERS[lister_name].load()() - lister_cls = registry_entry['lister'] - lister = lister_cls(override_config=conf) - return lister - - @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @click.option('--config-file', '-C', default=None, type=click.Path(exists=True, dir_okay=False,), help="Configuration file.") @click.option('--db-url', '-d', default=None, help='SQLAlchemy DB URL; see ' '') # noqa @click.pass_context def lister(ctx, config_file, db_url): '''Software Heritage Lister tools.''' from swh.core import config ctx.ensure_object(dict) override_conf = {} if db_url: override_conf['lister'] = { 'cls': 'local', 'args': {'db': db_url} } if not config_file: config_file = os.environ.get('SWH_CONFIG_FILENAME') conf = config.read(config_file, override_conf) ctx.obj['config'] = conf ctx.obj['override_conf'] = override_conf @lister.command(name='db-init', context_settings=CONTEXT_SETTINGS) @click.option('--drop-tables', '-D', is_flag=True, default=False, help='Drop tables before creating the database schema') @click.pass_context def db_init(ctx, drop_tables): """Initialize the database model for given listers. """ cfg = ctx.obj['config'] lister_cfg = cfg['lister'] if lister_cfg['cls'] != 'local': click.echo('A local lister configuration is required') ctx.exit(1) db_url = lister_cfg['args']['db'] db_engine = create_engine(db_url) registry = {} for lister, entrypoint in LISTERS.items(): logger.info('Loading lister %s', lister) registry[lister] = entrypoint.load()() logger.info('Initializing database') initialize(db_engine, drop_tables) for lister, entrypoint in LISTERS.items(): registry_entry = registry[lister] init_hook = registry_entry.get('init') if callable(init_hook): logger.info('Calling init hook for %s', lister) init_hook(db_engine) @lister.command(name='register-task-types', context_settings=CONTEXT_SETTINGS) @click.option('--lister', '-l', 'listers', multiple=True, default=('all', ), show_default=True, help='Only registers task-types for these listers', type=click.Choice(['all'] + SUPPORTED_LISTERS)) @click.pass_context def register_task_types(ctx, listers): """Insert missing task-type entries in the scheduler According to declared tasks in each loaded lister plugin. """ cfg = ctx.obj['config'] scheduler = get_scheduler(**cfg['scheduler']) for lister, entrypoint in LISTERS.items(): if 'all' not in listers and lister not in listers: continue logger.info('Loading lister %s', lister) registry_entry = entrypoint.load()() for task_module in registry_entry['task_modules']: mod = import_module(task_module) for task_name in (x for x in dir(mod) if not x.startswith('_')): taskobj = getattr(mod, task_name) if isinstance(taskobj, SWHTask): task_type = task_name.replace('_', '-') task_cfg = registry_entry.get('task_types', {}).get( task_type, {}) ensure_task_type(task_type, taskobj, task_cfg, scheduler) def ensure_task_type(task_type, swhtask, task_config, scheduler): """Ensure a task-type is known by the scheduler Args: task_type (str): the type of the task to check/insert (correspond to the 'type' field in the db) swhtask (SWHTask): the SWHTask instance the task-type correspond to task_config (dict): a dict with specific/overloaded values for the task-type to be created scheduler: the scheduler object used to access the scheduler db """ for suffix, defaults in DEFAULT_TASK_TYPE.items(): if task_type.endswith('-' + suffix): task_type_dict = defaults.copy() break else: task_type_dict = DEFAULT_TASK_TYPE['*'].copy() task_type_dict['type'] = task_type task_type_dict['backend_name'] = swhtask.name if swhtask.__doc__: task_type_dict['description'] = swhtask.__doc__.splitlines()[0] task_type_dict.update(task_config) current_task_type = scheduler.get_task_type(task_type) if current_task_type: # check some stuff if current_task_type['backend_name'] != task_type_dict['backend_name']: logger.warning('Existing task type %s for lister %s has a ' 'different backend name than current ' 'code version provides (%s vs. %s)', task_type, lister, current_task_type['backend_name'], task_type_dict['backend_name'], ) else: logger.info('Create task type %s in scheduler', task_type) logger.debug(' %s', task_type_dict) scheduler.create_task_type(task_type_dict) @lister.command(name='run', context_settings=CONTEXT_SETTINGS, help='Trigger a full listing run for a particular forge ' 'instance. The output of this listing results in ' '"oneshot" tasks in the scheduler db with a priority ' 'defined by the user') @click.option('--lister', '-l', help='Lister to run', type=click.Choice(SUPPORTED_LISTERS)) @click.option('--priority', '-p', default='high', type=click.Choice(['high', 'medium', 'low']), help='Task priority for the listed repositories to ingest') @click.argument('options', nargs=-1) @click.pass_context def run(ctx, lister, priority, options): from swh.scheduler.cli.utils import parse_options config = deepcopy(ctx.obj['config']) if options: config.update(parse_options(options)[1]) config['priority'] = priority config['policy'] = 'oneshot' get_lister(lister, **config).run() if __name__ == '__main__': lister() diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py index 32d03d4..34eefa4 100644 --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -1 +1,28 @@ from swh.scheduler.tests.conftest import * # noqa + +import pytest + +from sqlalchemy import create_engine + +from swh.lister import get_lister, SUPPORTED_LISTERS +from swh.lister.core.models import initialize + + +@pytest.fixture +def swh_listers(request, postgresql_proc, postgresql, swh_scheduler): + db_url = 'postgresql://{user}@{host}:{port}/{dbname}'.format( + host=postgresql_proc.host, + port=postgresql_proc.port, + user='postgres', + dbname='tests') + + listers = {} + + # Prepare schema for all listers + for lister_name in SUPPORTED_LISTERS: + lister = get_lister(lister_name, db_url=db_url) + lister.scheduler = swh_scheduler # inject scheduler fixture + listers[lister_name] = lister + initialize(create_engine(db_url), drop_tables=True) + + return listers