diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index 55188fd..6f814ef 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,229 +1,232 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import random from datetime import datetime from email.utils import parsedate from pprint import pformat import logging import requests import xmltodict try: from swh.lister._version import __version__ except ImportError: __version__ = 'devel' from .abstractattribute import AbstractAttribute from .lister_base import FetchError logger = logging.getLogger(__name__) class ListerHttpTransport(abc.ABC): """Use the Requests library for making Lister endpoint requests. To be used in conjunction with ListerBase or a subclass of it. """ DEFAULT_URL = None PATH_TEMPLATE = AbstractAttribute('string containing a python string' ' format pattern that produces the API' ' endpoint path for listing stored' ' repositories when given an index.' ' eg. "/repositories?after=%s".' 'To be implemented in the API-specific' ' class inheriting this.') EXPECTED_STATUS_CODES = (200, 429, 403, 404) def request_headers(self): """Returns dictionary of any request headers needed by the server. MAY BE OVERRIDDEN if request headers are needed. """ return { 'User-Agent': 'Software Heritage lister (%s)' % self.lister_version } def request_instance_credentials(self): """Returns dictionary of any credentials configuration needed by the forge instance to list. The 'credentials' configuration is expected to be a dict of multiple levels. The first level is the lister's name, the second is the lister's instance name, which value is expected to be a list of credential structures (typically a couple username/password). 
For example: credentials: github: # github lister github: # has only one instance (so far) - username: some password: somekey - username: one password: onekey - ... gitlab: # gitlab lister riseup: # has many instances - username: someone password: ... - ... gitlab: - username: someone password: ... - ... Returns: list of credential dicts for the current lister. """ all_creds = self.config.get('credentials') if not all_creds: return [] lister_creds = all_creds.get(self.LISTER_NAME, {}) creds = lister_creds.get(self.instance, []) return creds def request_uri(self, identifier): """Get the full request URI given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the PATH_TEMPLATE is required. """ path = self.PATH_TEMPLATE % identifier return self.url + path def request_params(self, identifier): """Get the full parameters passed to requests given the transport_request identifier. This uses credentials if any are provided (see request_instance_credentials). MAY BE OVERRIDDEN if something more complex than the request headers is needed. """ params = {} params['headers'] = self.request_headers() or {} creds = self.request_instance_credentials() if not creds: return params auth = random.choice(creds) if creds else None if auth: params['auth'] = (auth['username'], auth['password']) return params def transport_quota_check(self, response): """Implements ListerBase.transport_quota_check with standard 429 code check for HTTP with Requests library. MAY BE OVERRIDDEN if the server notifies about rate limits in a non-standard way that doesn't use HTTP 429 and the Retry-After response header. 
( https://tools.ietf.org/html/rfc6585#section-4 ) """ if response.status_code == 429: # HTTP too many requests retry_after = response.headers.get('Retry-After', self.back_off()) try: # might be seconds return True, float(retry_after) except Exception: # might be http-date at_date = datetime(*parsedate(retry_after)[:6]) from_now = (at_date - datetime.today()).total_seconds() + 5 return True, max(0, from_now) else: # response ok self.reset_backoff() return False, 0 def __init__(self, url=None): if not url: url = self.config.get('url') if not url: url = self.DEFAULT_URL if not url: raise NameError('HTTP Lister Transport requires an url.') self.url = url # eg. 'https://api.github.com' self.session = requests.Session() self.lister_version = __version__ def _transport_action(self, identifier, method='get'): """Permit to ask information to the api prior to actually executing query. """ path = self.request_uri(identifier) params = self.request_params(identifier) + logger.debug('path: %s', path) + logger.debug('params: %s', sorted(params)) # never log credentials + logger.debug('method: %s', method) try: if method == 'head': response = self.session.head(path, **params) else: response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: logger.warning('Failed to fetch %s: %s', path, e) raise FetchError(e) else: if response.status_code not in self.EXPECTED_STATUS_CODES: raise FetchError(response) return response def transport_head(self, identifier): """Retrieve head information on api. """ return self._transport_action(identifier, method='head') def transport_request(self, identifier): """Implements ListerBase.transport_request for HTTP using Requests. Retrieve get information on api. """ return self._transport_action(identifier) def transport_response_to_string(self, response): """Implements ListerBase.transport_response_to_string for HTTP given Requests responses. 
""" s = pformat(response.request.path_url) s += '\n#\n' + pformat(response.request.headers) s += '\n#\n' + pformat(response.status_code) s += '\n#\n' + pformat(response.headers) s += '\n#\n' try: # json? s += pformat(response.json()) except Exception: # not json try: # xml? s += pformat(xmltodict.parse(response.text)) except Exception: # not xml s += pformat(response.text) return s class ListerOnePageApiTransport(ListerHttpTransport): """Leverage requests library to retrieve basic html page and parse result. To be used in conjunction with ListerBase or a subclass of it. """ PAGE = AbstractAttribute("The server api's unique page to retrieve and " "parse for information") PATH_TEMPLATE = None # we do not use it def __init__(self, url=None): self.session = requests.Session() self.lister_version = __version__ def request_uri(self, _): """Get the full request URI given the transport_request identifier. """ return self.PAGE diff --git a/swh/lister/phabricator/tests/conftest.py b/swh/lister/phabricator/tests/conftest.py index 507fef9..22de766 100644 --- a/swh/lister/phabricator/tests/conftest.py +++ b/swh/lister/phabricator/tests/conftest.py @@ -1 +1,26 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + from swh.lister.core.tests.conftest import * # noqa + + +@pytest.fixture +def lister_phabricator(swh_listers): + lister = swh_listers['phabricator'] + + # Amend the credentials + lister.config = { + 'cache_responses': False, + 'credentials': { + 'phabricator': { + lister.instance: [{ + 'password': 'foo' + }] + }} + } + + return lister diff --git a/swh/lister/phabricator/tests/api_empty_response.json b/swh/lister/phabricator/tests/data/api_empty_response.json similarity index 100% rename from swh/lister/phabricator/tests/api_empty_response.json rename to 
swh/lister/phabricator/tests/data/api_empty_response.json diff --git a/swh/lister/phabricator/tests/api_first_response.json b/swh/lister/phabricator/tests/data/api_first_response.json similarity index 100% rename from swh/lister/phabricator/tests/api_first_response.json rename to swh/lister/phabricator/tests/data/api_first_response.json diff --git a/swh/lister/phabricator/tests/api_first_response_other_instance.json b/swh/lister/phabricator/tests/data/api_first_response_other_instance.json similarity index 100% rename from swh/lister/phabricator/tests/api_first_response_other_instance.json rename to swh/lister/phabricator/tests/data/api_first_response_other_instance.json diff --git a/swh/lister/phabricator/tests/api_next_response.json b/swh/lister/phabricator/tests/data/api_next_response.json similarity index 100% rename from swh/lister/phabricator/tests/api_next_response.json rename to swh/lister/phabricator/tests/data/api_next_response.json diff --git a/swh/lister/phabricator/tests/api_response_undefined_protocol.json b/swh/lister/phabricator/tests/data/api_response_undefined_protocol.json similarity index 100% rename from swh/lister/phabricator/tests/api_response_undefined_protocol.json rename to swh/lister/phabricator/tests/data/api_response_undefined_protocol.json diff --git a/swh/lister/phabricator/tests/data/forge.softwareheritage.org/api_diffusion.repository.search,order=oldest,attachments%5Buris%5D=1,after=,api.token=foo b/swh/lister/phabricator/tests/data/forge.softwareheritage.org/api_diffusion.repository.search,order=oldest,attachments%5Buris%5D=1,after=,api.token=foo new file mode 120000 index 0000000..9414cb8 --- /dev/null +++ b/swh/lister/phabricator/tests/data/forge.softwareheritage.org/api_diffusion.repository.search,order=oldest,attachments%5Buris%5D=1,after=,api.token=foo @@ -0,0 +1 @@ +../api_first_response.json \ No newline at end of file diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py 
index 78f0d1c..a433226 100644 --- a/swh/lister/phabricator/tests/test_lister.py +++ b/swh/lister/phabricator/tests/test_lister.py @@ -1,109 +1,142 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import json +import logging import unittest import requests_mock from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.phabricator.lister import PhabricatorLister from swh.lister.phabricator.lister import get_repo_url +logger = logging.getLogger(__name__) + + class PhabricatorListerTester(HttpListerTester, unittest.TestCase): Lister = PhabricatorLister # first request will have the after parameter empty test_re = re.compile(r'\&after=([^?&]*)') lister_subdir = 'phabricator' - good_api_response_file = 'api_first_response.json' - good_api_response_undefined_protocol = 'api_response_undefined_'\ - 'protocol.json' - bad_api_response_file = 'api_empty_response.json' + good_api_response_file = 'data/api_first_response.json' + good_api_response_undefined_protocol = \ + 'data/api_response_undefined_protocol.json' + bad_api_response_file = 'data/api_empty_response.json' # first_index must be retrieved through a bootstrap process for Phabricator first_index = None last_index = 12 entries_per_page = 10 convert_type = int def request_index(self, request): """(Override) This is needed to emulate the listing bootstrap when no min_bound is provided to run """ m = self.test_re.search(request.path_url) idx = m.group(1) if idx not in ('', 'None'): return int(idx) def get_fl(self, override_config=None): """(Override) Retrieve an instance of fake lister (fl). 
""" if override_config or self.fl is None: credentials = {'phabricator': {'fake': [ {'password': 'toto'} ]}} override_config = dict(credentials=credentials, **(override_config or {})) self.fl = self.Lister(url='https://fakeurl', instance='fake', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def test_get_repo_url(self): f = open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_file)) api_response = json.load(f) repos = api_response['result']['data'] for repo in repos: self.assertEqual( 'https://forge.softwareheritage.org/source/%s.git' % (repo['fields']['shortName']), get_repo_url(repo['attachments']['uris']['uris'])) f = open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_undefined_protocol)) repo = json.load(f) self.assertEqual( 'https://svn.blender.org/svnroot/bf-blender/', get_repo_url(repo['attachments']['uris']['uris'])) @requests_mock.Mocker() def test_scheduled_tasks(self, http_mocker): - self.scheduled_tasks_test('api_next_response.json', 23, http_mocker) + self.scheduled_tasks_test('data/api_next_response.json', 23, + http_mocker) @requests_mock.Mocker() def test_scheduled_tasks_multiple_instances(self, http_mocker): fl = self.create_fl_with_db(http_mocker) # list first Phabricator instance fl.run() fl.instance = 'other_fake' fl.config['credentials'] = { 'phabricator': { 'other_fake': [{ 'password': 'foo' }] } } # list second Phabricator instance hosting repositories having # same ids as those listed from the first instance - self.good_api_response_file = 'api_first_response_other_instance.json' + self.good_api_response_file = \ + 'data/api_first_response_other_instance.json' self.last_index = 13 fl.run() # check expected number of loading tasks self.assertEqual(len(self.scheduler_tasks), 2 * self.entries_per_page) # check tasks are not disabled for task in self.scheduler_tasks: self.assertTrue(task['status'] != 'disabled') + + +def 
test_phabricator_lister(lister_phabricator, requests_mock_datadir): + lister = lister_phabricator + assert lister.url == lister.DEFAULT_URL + assert lister.instance == 'forge.softwareheritage.org' + lister.run() + + r = lister.scheduler.search_tasks(task_type='load-git') + assert len(r) == 10 + + for row in r: + assert row['type'] == 'load-git' + # arguments check + args = row['arguments']['args'] + assert len(args) == 1 + + url = args[0] + assert lister.instance in url + + # kwargs + kwargs = row['arguments']['kwargs'] + assert kwargs == {} + + assert row['policy'] == 'recurring' + assert row['priority'] is None