# ---------------------------------------------------------------------------
# swh/lister/phabricator/lister.py
# ---------------------------------------------------------------------------
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import random
import urllib.parse
from collections import defaultdict

from swh.lister.core.indexing_lister import IndexingHttpLister
from swh.lister.phabricator.models import PhabricatorModel

logger = logging.getLogger(__name__)


class PhabricatorLister(IndexingHttpLister):
    """Lister for the repositories hosted on a Phabricator instance.

    Repositories are fetched from the ``diffusion.repository.search`` API
    endpoint, oldest first, and indexed by their Phabricator ``id``.
    """
    # Query string of a listing request; ``%s`` is the ``after`` cursor.
    # NOTE(review): presumably interpolated by the base lister with the
    # current index - confirm against IndexingHttpLister.
    PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s'
    DEFAULT_URL = 'https://forge.softwareheritage.org/api/diffusion.repository.search'  # noqa
    MODEL = PhabricatorModel
    LISTER_NAME = 'phabricator'

    def __init__(self, url=None, instance=None, override_config=None):
        """Set up the lister.

        Args:
            url: API url to query, forwarded to the parent class.
            instance: name of the Phabricator instance; when falsy, the
                hostname of ``self.url`` is used instead.
            override_config: configuration override, forwarded to the
                parent class.
        """
        super().__init__(url=url, override_config=override_config)
        if not instance:
            instance = urllib.parse.urlparse(self.url).hostname
        self.instance = instance

    @property
    def default_min_bound(self):
        """Starting boundary when `min_bound` is not defined (db empty).

        This is used within the fn:`run` call.
        """
        return self._bootstrap_repositories_listing()

    def request_params(self, identifier):
        """Override the default params behavior to retrieve the api token.

        Credentials are stored as:

            credentials:
              phabricator:
                INSTANCE_NAME:
                  - username: ACCOUNT
                    password: API_TOKEN

        One credential entry is picked at random for each request and its
        ``password`` value is sent as the ``api.token`` parameter.

        Raises:
            ValueError: when no credential is configured for the instance.
        """
        creds = self.request_instance_credentials()
        if not creds:
            raise ValueError(
                'Phabricator forge needs authentication credential to list.')
        api_token = random.choice(creds)['password']

        return {'headers': self.request_headers() or {},
                'params': {'api.token': api_token}}

    def request_headers(self):
        """(Override) Set requests headers to send when querying the
        Phabricator API.
        """
        return {'User-Agent': 'Software Heritage phabricator lister',
                'Accept': 'application/json'}

    def get_model_from_repo(self, repo):
        """Convert one repository entry of the API response into a model dict.

        Returns:
            dict: the model values, or None when no http(s) url can be
            determined for the repository (see :func:`get_repo_url`).
        """
        url = get_repo_url(repo['attachments']['uris']['uris'])
        if url is None:
            return None
        return {
            'uid': url,
            'indexable': repo['id'],
            'name': repo['fields']['shortName'],
            'full_name': repo['fields']['name'],
            'html_url': url,
            'origin_url': url,
            'origin_type': repo['fields']['vcs'],
            'instance': self.instance,
        }

    def get_next_target_from_response(self, response):
        """Return the index of the next page to request.

        Returns:
            int: the ``after`` cursor of the response converted to int, or
            None when the cursor is empty or the string ``'null'``.
        """
        cursor = response.json()['result']['cursor']
        after = cursor['after']
        if after and after != 'null':
            return int(after)
        return None

    def transport_response_simplified(self, response):
        """Extract the list of repository models from an API response.

        Entries for which no model could be built are returned as None.

        Raises:
            ValueError: when the response carries no result; the message
                includes the ``error_code`` reported by the API.
        """
        repos = response.json()
        if repos['result'] is None:
            raise ValueError(
                'Problem during information fetch: %s' % repos['error_code'])
        repos = repos['result']['data']
        return [self.get_model_from_repo(repo) for repo in repos]

    def filter_before_inject(self, models_list):
        """(Overrides) IndexingLister.filter_before_inject

        Drop the None entries (repositories for which no model could be
        built) before delegating to the parent implementation.
        """
        models_list = [m for m in models_list if m is not None]
        return super().filter_before_inject(models_list)

    def _bootstrap_repositories_listing(self):
        """
        Method called when no min_bound value has been provided
        to the lister. Its purpose is to:

            1. get the first repository data hosted on the Phabricator
               instance

            2. inject them into the lister database

            3. return the first repository index to start the listing
               after that value

        Returns:
            int: The first repository index
        """
        # Passed in place of an index: the request then carries an empty
        # `after` value followed by these extra query parameters.
        # NOTE(review): relies on how the base class builds the request
        # path from the index - confirm.
        params = '&order=oldest&limit=1'
        response = self.safely_issue_request(params)
        models_list = self.transport_response_simplified(response)
        models_list = self.filter_before_inject(models_list)
        injected = self.inject_repo_data_into_db(models_list)
        self.schedule_missing_tasks(models_list, injected)
        # NOTE(review): raises IndexError when nothing is left after
        # filtering (e.g. the first repository has no http(s) url).
        return models_list[0]['indexable']


def get_repo_url(attachments):
    """
    Return url for a hosted repository from its uris attachments according
    to the following priority lists:

    * protocol: https > http
    * identifier: shortname > callsign > id

    Uris whose builtin protocol is None are still considered when their
    effective url starts with ``https`` or ``http``; they rank after the
    identifiers listed above.

    Returns:
        str: the selected url, or None when no http(s) url is available.
    """
    processed_urls = defaultdict(dict)
    for uri in attachments:
        protocol = uri['fields']['builtin']['protocol']
        url = uri['fields']['uri']['effective']
        identifier = uri['fields']['builtin']['identifier']
        if protocol in ('http', 'https'):
            processed_urls[protocol][identifier] = url
        elif protocol is None:
            # 'https' is tested first and the loop stops on the first match:
            # an https url also starts with 'http' and must not be recorded
            # under both schemes.
            for scheme in ('https', 'http'):
                if url.startswith(scheme):
                    processed_urls[scheme]['undefined'] = url
                    break

    for protocol in ('https', 'http'):
        for identifier in ('shortname', 'callsign', 'id', 'undefined'):
            # Membership is tested before indexing so that the lookup does
            # not insert empty entries into the defaultdict.
            if (protocol in processed_urls and
                    identifier in processed_urls[protocol]):
                return processed_urls[protocol][identifier]
    return None


# ---------------------------------------------------------------------------
# swh/lister/phabricator/tests/test_lister.py
# ---------------------------------------------------------------------------
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import json
import unittest

import requests_mock

from swh.lister.core.tests.test_lister import HttpListerTester
from swh.lister.phabricator.lister import PhabricatorLister
from swh.lister.phabricator.lister import get_repo_url


class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
    """Run the generic HTTP lister tests against PhabricatorLister."""
    Lister = PhabricatorLister
    # first request will have the after parameter empty
    test_re = re.compile(r'\&after=([^?&]*)')
    lister_subdir = 'phabricator'
    good_api_response_file = 'api_response.json'
    good_api_response_undefined_protocol = 'api_response_undefined_'\
                                           'protocol.json'
    bad_api_response_file = 'api_empty_response.json'
    # first_index must be retrieved through a bootstrap process for Phabricator
    first_index = None
    last_index = 12
    entries_per_page = 10

    convert_type = int

    def request_index(self, request):
        """(Override) This is needed to emulate the listing bootstrap
        when no min_bound is provided to run.

        Returns:
            int: the requested ``after`` value when it equals
            ``last_index``, None otherwise.
        """
        m = self.test_re.search(request.path_url)
        idx = m.group(1)
        if idx == str(self.last_index):
            return int(idx)
        return None

    def get_fl(self, override_config=None):
        """(Override) Retrieve an instance of fake lister (fl).

        The lister is created with fake credentials for the 'fake'
        instance, since PhabricatorLister refuses to list without any.
        """
        if override_config or self.fl is None:
            credentials = {'phabricator': {'fake': [
                {'password': 'toto'}
            ]}}
            override_config = dict(credentials=credentials,
                                   **(override_config or {}))
            self.fl = self.Lister(url='https://fakeurl', instance='fake',
                                  override_config=override_config)
            self.fl.INITIAL_BACKOFF = 1

        self.fl.reset_backoff()
        return self.fl

    def test_get_repo_url(self):
        """get_repo_url selects the expected url from the API fixtures."""
        response_path = 'swh/lister/%s/tests/%s' % (
            self.lister_subdir, self.good_api_response_file)
        # Context managers make sure the fixture files get closed.
        with open(response_path) as f:
            api_response = json.load(f)
        repos = api_response['result']['data']
        for repo in repos:
            self.assertEqual(
                'https://forge.softwareheritage.org/source/%s.git' %
                (repo['fields']['shortName']),
                get_repo_url(repo['attachments']['uris']['uris']))

        undefined_protocol_path = 'swh/lister/%s/tests/%s' % (
            self.lister_subdir, self.good_api_response_undefined_protocol)
        with open(undefined_protocol_path) as f:
            repo = json.load(f)
        self.assertEqual(
            'https://svn.blender.org/svnroot/bf-blender/',
            get_repo_url(repo['attachments']['uris']['uris']))

    @requests_mock.Mocker()
    def test_full_listing(self, http_mocker):
        """A run without bounds ingests a whole page of repositories."""
        fl = self.create_fl_with_db(http_mocker)

        fl.run()

        self.assertEqual(fl.db_last_index(), self.last_index)
        ingested_repos = list(fl.db_query_range(self.first_index,
                                                self.last_index))
        self.assertEqual(len(ingested_repos), self.entries_per_page)

    @requests_mock.Mocker()
    def test_range_listing(self, http_mocker):
        """A run bounded by max_bound leaves out the last repository."""
        fl = self.create_fl_with_db(http_mocker)

        fl.run(max_bound=self.last_index - 1)

        self.assertEqual(fl.db_last_index(), self.last_index - 1)
        ingested_repos = list(fl.db_query_range(self.first_index,
                                                self.last_index))
        self.assertEqual(len(ingested_repos), self.entries_per_page - 1)