# File: swh/lister/phabricator/lister.py (state after the patch is applied)
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import random
import urllib.parse

from swh.lister.core.indexing_lister import IndexingHttpLister
from swh.lister.phabricator.models import PhabricatorModel
from collections import defaultdict


logger = logging.getLogger(__name__)


class PhabricatorLister(IndexingHttpLister):
    """Lister for the repositories hosted on a Phabricator instance.

    Repositories are fetched from the ``diffusion.repository.search``
    endpoint, ordered from oldest to newest, and paginated through the
    ``after`` cursor.
    """
    PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s'
    DEFAULT_URL = 'https://forge.softwareheritage.org/api/diffusion.repository.search'  # noqa
    MODEL = PhabricatorModel
    LISTER_NAME = 'phabricator'

    def __init__(self, api_baseurl=None, instance=None, override_config=None):
        """Initialize the lister.

        Args:
            api_baseurl: url of the ``diffusion.repository.search`` endpoint,
                forwarded as is to the parent class.
            instance: name of the Phabricator instance; when falsy, the
                hostname of ``self.api_baseurl`` is used instead.
            override_config: configuration override forwarded to the parent
                class.
        """
        super().__init__(api_baseurl=api_baseurl,
                         override_config=override_config)
        # self.api_baseurl is only read after the parent constructor ran
        if not instance:
            instance = urllib.parse.urlparse(self.api_baseurl).hostname
        self.instance = instance

    @property
    def default_min_bound(self):
        """Starting boundary when `min_bound` is not defined (db empty). This
        is used within the fn:`run` call.

        """
        return self._bootstrap_repositories_listing()

    def request_params(self, identifier):
        """Override the default params behavior to retrieve the api token

        Credentials are stored as:

        credentials:
          phabricator:
            <instance>:
              - username: <username>
                password: <api-token>

        Raises:
            ValueError: when no credential is configured for the instance.
        """
        creds = self.request_instance_credentials()
        if not creds:
            raise ValueError(
                'Phabricator forge needs authentication credential to list.')
        # One of the configured tokens is picked at random for each request
        api_token = random.choice(creds)['password']

        return {'headers': self.request_headers() or {},
                'params': {'api.token': api_token}}

    def request_headers(self):
        """
        (Override) Set requests headers to send when querying the
        Phabricator API
        """
        return {'User-Agent': 'Software Heritage phabricator lister',
                'Accept': 'application/json'}

    def get_model_from_repo(self, repo):
        """Convert one repository entry of the API response into a model dict.

        Returns:
            dict describing the repository, or None when no http(s) url
            could be found in its uris attachments.
        """
        url = get_repo_url(repo['attachments']['uris']['uris'])
        if url is None:
            return None
        return {
            'uid': url,
            'indexable': repo['id'],
            'name': repo['fields']['shortName'],
            'full_name': repo['fields']['name'],
            'html_url': url,
            'origin_url': url,
            'origin_type': repo['fields']['vcs'],
            'instance': self.instance,
        }

    def get_next_target_from_response(self, response):
        """Return the ``after`` cursor of the response, or None if there is
        no further page.
        """
        body = response.json()['result']['cursor']
        # A JSON null is decoded as None and is returned as is, which also
        # means "no next page" for the caller.
        if body['after'] != 'null':
            return body['after']
        return None

    def transport_response_simplified(self, response):
        """Convert the API response into a list of models.

        Entries may be None for repositories without an http(s) url.

        Raises:
            ValueError: when the response holds no result.
        """
        repos = response.json()
        if repos['result'] is None:
            raise ValueError(
                'Problem during information fetch: %s' % repos['error_code'])
        repos = repos['result']['data']
        return [self.get_model_from_repo(repo) for repo in repos]

    def filter_before_inject(self, models_list):
        """
        (Overrides) IndexingLister.filter_before_inject
        Bounds query results by this Lister's set max_index.

        Models set to None are dropped before delegating to the parent class.
        """
        models_list = [m for m in models_list if m is not None]
        return super().filter_before_inject(models_list)

    def _bootstrap_repositories_listing(self):
        """
        Method called when no min_bound value has been provided
        to the lister. Its purpose is to:

            1. get the first repository data hosted on the Phabricator
               instance

            2. inject them into the lister database

            3. return the first repository index to start the listing
               after that value

        Returns:
            int: The first repository index

        Raises:
            ValueError: when the instance returned no repository at all.
        """
        params = '&order=oldest&limit=1'
        response = self.safely_issue_request(params)
        # Also validates the response (raises when there is no result)
        models_list = self.transport_response_simplified(response)
        repos = response.json()['result']['data']
        if not repos:
            raise ValueError(
                'Unable to bootstrap the listing: no repository returned '
                'by the Phabricator instance.')
        # Read the index from the raw payload: the matching model is None
        # when the repository has no http(s) url, and indexing it would
        # fail with a TypeError.
        self.max_index = repos[0]['id']
        models_list = self.filter_before_inject(models_list)
        if models_list:
            injected = self.inject_repo_data_into_db(models_list)
            self.schedule_missing_tasks(models_list, injected)
        return self.max_index


def get_repo_url(attachments):
    """
    Return url for a hosted repository from its uris attachments according
    to the following priority lists:
    * protocol: https > http
    * identifier: shortname > callsign > id

    A uri with no builtin protocol is filed under the 'undefined' identifier
    (lowest priority) of the scheme its effective url starts with.

    Returns:
        the selected url, or None when no http(s) url is available.
    """
    processed_urls = defaultdict(dict)
    for uri in attachments:
        protocol = uri['fields']['builtin']['protocol']
        url = uri['fields']['uri']['effective']
        identifier = uri['fields']['builtin']['identifier']
        if not url:
            # nothing usable for this uri
            continue
        if protocol in ('http', 'https'):
            processed_urls[protocol][identifier] = url
        elif protocol is None:
            # Match on the full 'scheme://' prefix so that only real
            # http(s) urls are kept.
            for scheme in ('https', 'http'):
                if url.startswith(scheme + '://'):
                    processed_urls[scheme]['undefined'] = url
                    break
    for protocol in ('https', 'http'):
        candidates = processed_urls.get(protocol, {})
        for identifier in ('shortname', 'callsign', 'id', 'undefined'):
            if identifier in candidates:
                return candidates[identifier]
    return None
# File: swh/lister/phabricator/tests/test_lister.py (state after the patch)
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import json
import unittest
from swh.lister.core.tests.test_lister import HttpListerTester
from swh.lister.phabricator.lister import PhabricatorLister
from swh.lister.phabricator.lister import get_repo_url


class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
    """Run the generic http lister test suite against PhabricatorLister."""
    Lister = PhabricatorLister
    test_re = re.compile(r'\&after=([^?&]+)')
    lister_subdir = 'phabricator'
    good_api_response_file = 'api_response.json'
    good_api_response_undefined_protocol = 'api_response_undefined_'\
                                           'protocol.json'
    bad_api_response_file = 'api_empty_response.json'
    first_index = 1
    last_index = 12
    entries_per_page = 10
    convert_type = int

    def get_fl(self, override_config=None):
        """(Override) Retrieve an instance of fake lister (fl).

        A fake credential is always injected in the configuration because
        the lister refuses to build its requests without one.
        """
        if override_config or self.fl is None:
            credentials = {'phabricator': {'fake': [
                {'password': 'toto'}
            ]}}
            override_config = dict(credentials=credentials,
                                   **(override_config or {}))
            self.fl = self.Lister(
                api_baseurl='https://fakeurl', instance='fake',
                override_config=override_config)
            self.fl.INITIAL_BACKOFF = 1

        self.fl.reset_backoff()
        return self.fl

    def _load_json_fixture(self, filename):
        """Load a json fixture of this lister and close the file afterwards.
        """
        path = 'swh/lister/%s/tests/%s' % (self.lister_subdir, filename)
        with open(path) as f:
            return json.load(f)

    def test_get_repo_url(self):
        """get_repo_url should pick the https url of each repository."""
        api_response = self._load_json_fixture(self.good_api_response_file)
        repos = api_response['result']['data']
        for repo in repos:
            self.assertEqual(
                'https://forge.softwareheritage.org/source/%s.git' %
                (repo['fields']['shortName']),
                get_repo_url(repo['attachments']['uris']['uris']))

        # uri whose builtin protocol is not defined
        repo = self._load_json_fixture(
            self.good_api_response_undefined_protocol)
        self.assertEqual(
            'https://svn.blender.org/svnroot/bf-blender/',
            get_repo_url(repo['attachments']['uris']['uris']))
# File: swh/lister/tests/test_cli.py (state after the patch is applied)
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest

from swh.lister.core.lister_base import ListerBase
from swh.lister.cli import get_lister, SUPPORTED_LISTERS, DEFAULT_BASEURLS

from .test_utils import init_db


def test_get_lister_wrong_input():
    """Unsupported lister should raise"""
    with pytest.raises(ValueError) as e:
        get_lister('unknown', 'db-url')

    assert "Invalid lister" in str(e.value)


def test_get_lister():
    """Instantiating a supported lister should be ok

    """
    db_url = init_db().url()
    # listers for which get_lister also returns an insert_data function
    supported_listers_with_init = {'npm', 'debian'}
    supported_listers = set(SUPPORTED_LISTERS) - supported_listers_with_init
    for lister_name in supported_listers:
        lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)

        assert isinstance(lst, ListerBase)
        assert drop_fn is None
        assert init_fn is not None
        assert insert_data_fn is None

    for lister_name in supported_listers_with_init:
        lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)

        assert isinstance(lst, ListerBase)
        assert drop_fn is None
        assert init_fn is not None
        assert insert_data_fn is not None

    for lister_name in supported_listers_with_init:
        lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url,
                                                           drop_tables=True)

        assert isinstance(lst, ListerBase)
        assert drop_fn is not None
        assert init_fn is not None
        assert insert_data_fn is not None


def test_get_lister_override():
    """Overriding the lister configuration should populate its config

    """
    db_url = init_db().url()

    # lister name -> (attribute holding the url, overriding url)
    listers = {
        'gitlab': ('api_baseurl', 'https://gitlab.uni/api/v4/'),
        'phabricator': (
            'api_baseurl',
            'https://somewhere.org/api/diffusion.repository.search'),
    }

    # check the override ends up defined in the lister
    for lister_name, (url_key, url_value) in listers.items():
        lst, drop_fn, init_fn, insert_data_fn = get_lister(
            lister_name, db_url, **{
                'api_baseurl': url_value,
                'priority': 'high',
                'policy': 'oneshot',
            })

        assert getattr(lst, url_key) == url_value
        assert lst.config['priority'] == 'high'
        assert lst.config['policy'] == 'oneshot'

    # check the default urls are used and not the override (since it's not
    # passed)
    for lister_name, (url_key, url_value) in listers.items():
        lst, drop_fn, init_fn, insert_data_fn = get_lister(lister_name, db_url)

        # no override so this does not end up in lister's configuration
        assert url_key not in lst.config

        # then the default base url is used
        default_url = DEFAULT_BASEURLS[lister_name]

        assert getattr(lst, url_key) == default_url
        assert 'priority' not in lst.config
        # 'oneshot' is a value: the key to look for in the config is 'policy'
        assert 'policy' not in lst.config