diff --git a/swh/lister/phabricator/__init__.py b/swh/lister/phabricator/__init__.py
--- a/swh/lister/phabricator/__init__.py
+++ b/swh/lister/phabricator/__init__.py
@@ -5,10 +5,9 @@
 
 def register():
     from .lister import PhabricatorLister
-    from .models import PhabricatorModel
 
     return {
-        "models": [PhabricatorModel],
+        "models": [],
         "lister": PhabricatorLister,
         "task_modules": ["%s.tasks" % __name__],
     }
diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py
--- a/swh/lister/phabricator/lister.py
+++ b/swh/lister/phabricator/lister.py
@@ -1,166 +1,147 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2020 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
-
 from collections import defaultdict
 import logging
 import random
-from typing import Any, Dict, List, Optional
-import urllib.parse
+from typing import Any, Dict, Iterator, List, Optional
 
-from requests import Response
-from sqlalchemy import func
+import requests
 
-from swh.lister.core.indexing_lister import IndexingHttpLister
-from swh.lister.phabricator.models import PhabricatorModel
+from swh.lister import USER_AGENT
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
 
 logger = logging.getLogger(__name__)
 
 
-class PhabricatorLister(IndexingHttpLister):
-    PATH_TEMPLATE = "?order=oldest&attachments[uris]=1&after=%s"
-    DEFAULT_URL = "https://forge.softwareheritage.org/api/diffusion.repository.search"
-    MODEL = PhabricatorModel
-    LISTER_NAME = "phabricator"
+PageType = List[Dict[str, Any]]
 
-    def __init__(self, url=None, instance=None, override_config=None):
-        super().__init__(url=url, override_config=override_config)
-        if not instance:
-            instance = urllib.parse.urlparse(self.url).hostname
-        self.instance = instance
 
-    def request_params(self, identifier: str) -> Dict[str, Any]:
-        """Override the default params behavior to retrieve the api token
+class PhabricatorLister(StatelessLister[PageType]):
+    LISTER_NAME = "phabricator"
 
-        Credentials are stored as:
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        url: str,
+        instance: str,
+        api_token: Optional[str] = None,
+        credentials: CredentialsType = None,
+    ):
+        super().__init__(scheduler, url, instance, credentials)
+
+        self.session = requests.Session()
+        self.session.headers.update(
+            {"Accept": "application/json", "User-Agent": USER_AGENT}
+        )
+
+        if api_token is not None:
+            self.api_token = api_token
+        else:
+            if not self.credentials:
+                raise ValueError(
+                    f"No credentials found for phabricator instance {self.instance};"
+                    " Please set them in the lister configuration file."
+                )
+
+            self.api_token = random.choice(self.credentials)["password"]
+
+    def get_request_params(self, after: Optional[str]) -> Dict[str, str]:
+        """Get the query parameters for the request."""
+
+        base_params = {
+            # Stable order
+            "order": "oldest",
+            # Add all URIs to the response
+            "attachments[uris]": "1",
+            # API token from stored credentials
+            "api.token": self.api_token,
+        }
 
-        credentials:
-          phabricator:
-            <instance>:
-              - username: <account>
-                password: <api token>
+        if after is not None:
+            base_params["after"] = after
 
-        """
-        creds = self.request_instance_credentials()
-        if not creds:
-            raise ValueError(
-                "Phabricator forge needs authentication credential to list."
-            )
-        api_token = random.choice(creds)["password"]
+        return base_params
 
+    @staticmethod
+    def filter_params(params: Dict[str, str]) -> Dict[str, str]:
+        """Filter the parameters for debug purposes"""
         return {
-            "headers": self.request_headers() or {},
-            "params": {"api.token": api_token},
+            k: (v if k != "api.token" else "**redacted**") for k, v in params.items()
         }
 
-    def request_headers(self):
-        """
-        (Override) Set requests headers to send when querying the
-        Phabricator API
-        """
-        headers = super().request_headers()
-        headers["Accept"] = "application/json"
-        return headers
-
-    def get_model_from_repo(self, repo: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-        url = get_repo_url(repo["attachments"]["uris"]["uris"])
-        if url is None:
-            return None
-        return {
-            "uid": url,
-            "indexable": repo["id"],
-            "name": repo["fields"]["shortName"],
-            "full_name": repo["fields"]["name"],
-            "html_url": url,
-            "origin_url": url,
-            "origin_type": repo["fields"]["vcs"],
-            "instance": self.instance,
-        }
+    def get_pages(self) -> Iterator[PageType]:
+        after: Optional[str] = None
+        while True:
+            params = self.get_request_params(after)
+            logger.debug(
+                "Retrieving results on URI=%s, parameters %s",
+                self.url,
+                self.filter_params(params),
+            )
+            response = self.session.post(self.url, data=params)
+
+            if response.status_code != 200:
+                logger.warning(
+                    "Got unexpected status_code %s on %s: %s",
+                    response.status_code,
+                    response.url,
+                    response.content,
+                )
+                break
 
-    def get_next_target_from_response(self, response: Response) -> Optional[int]:
-        body = response.json()["result"]["cursor"]
-        if body["after"] and body["after"] != "null":
-            return int(body["after"])
-        return None
-
-    def transport_response_simplified(
-        self, response: Response
-    ) -> List[Optional[Dict[str, Any]]]:
-        repos = response.json()
-        if repos["result"] is None:
-            raise ValueError(
-                "Problem during information fetch: %s" % repos["error_code"]
+            response_data = response.json()
+
+            if response_data.get("result") is None:
+                logger.warning(
+                    "Got unexpected response on %s: %s", response.url, response_data,
+                )
+                break
+
+            result = response_data["result"]
+
+            yield result["data"]
+            after = None
+            if "cursor" in result and "after" in result["cursor"]:
+                after = result["cursor"]["after"]
+
+            if not after:
+                logger.debug("Empty `after` cursor. All done")
+                break
+
+    def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
+        assert self.lister_obj.id is not None
+
+        for repo in page:
+            url = get_repo_url(repo["attachments"]["uris"]["uris"])
+            if url is None:
+                short_name: Optional[str] = None
+
+                for field in "shortName", "name", "callsign":
+                    short_name = repo["fields"].get(field)
+                    if short_name:
+                        break
+
+                logger.warning(
+                    "No valid url for repository [%s] (phid=%s)",
+                    short_name or repo["phid"],
+                    repo["phid"],
+                )
+                continue
+
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=url,
+                visit_type=repo["fields"]["vcs"],
+                # The "dateUpdated" field returned by the Phabricator API only refers to
+                # the repository metadata; We can't use it for our purposes.
+                last_update=None,
             )
-        repos = repos["result"]["data"]
-        return [self.get_model_from_repo(repo) for repo in repos]
-
-    def filter_before_inject(self, models_list):
-        """
-        (Overrides) IndexingLister.filter_before_inject
-
-        Bounds query results by this Lister's set max_index.
-        """
-        models_list = [m for m in models_list if m is not None]
-        return super().filter_before_inject(models_list)
-
-    def disable_deleted_repo_tasks(self, index: int, next_index: int, keep_these: str):
-        """
-        (Overrides) Fix provided index value to avoid:
-
-            - database query error
-            - erroneously disabling some scheduler tasks
-        """
-        # First call to the Phabricator API uses an empty 'after' parameter,
-        # so set the index to 0 to avoid database query error
-        if index == "":
-            index = 0
-        # Next listed repository ids are strictly greater than the 'after'
-        # parameter, so increment the index to avoid disabling the latest
-        # created task when processing a new repositories page returned by
-        # the Phabricator API
-        else:
-            index = index + 1
-        return super().disable_deleted_repo_tasks(index, next_index, keep_these)
-
-    def db_first_index(self) -> Optional[int]:
-        """
-        (Overrides) Filter results by Phabricator instance
-
-        Returns:
-            the smallest indexable value of all repos in the db
-        """
-        t = self.db_session.query(func.min(self.MODEL.indexable))
-        t = t.filter(self.MODEL.instance == self.instance).first()
-        if t:
-            return t[0]
-        return None
-
-    def db_last_index(self):
-        """
-        (Overrides) Filter results by Phabricator instance
-
-        Returns:
-            the largest indexable value of all instance repos in the db
-        """
-        t = self.db_session.query(func.max(self.MODEL.indexable))
-        t = t.filter(self.MODEL.instance == self.instance).first()
-        if t:
-            return t[0]
-
-    def db_query_range(self, start: int, end: int):
-        """
-        (Overrides) Filter the results by the Phabricator instance to
-        avoid disabling loading tasks for repositories hosted on a
-        different instance.
-
-        Returns:
-            a list of sqlalchemy.ext.declarative.declarative_base objects
-            with indexable values within the given range for the instance
-        """
-        retlist = super().db_query_range(start, end)
-        return retlist.filter(self.MODEL.instance == self.instance)
-
-
-def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[int]:
+
+
+def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[str]:
     """
     Return url for a hosted repository from its uris attachments
     according to the following priority lists:
diff --git a/swh/lister/phabricator/models.py b/swh/lister/phabricator/models.py
deleted file mode 100644
--- a/swh/lister/phabricator/models.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (C) 2019 the Software Heritage developers
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from sqlalchemy import Column, Integer, String
-
-from swh.lister.core.models import IndexingModelBase
-
-
-class PhabricatorModel(IndexingModelBase):
-    """a Phabricator repository"""
-
-    __tablename__ = "phabricator_repo"
-
-    uid = Column(String, primary_key=True)
-    indexable = Column(Integer, index=True)
-    instance = Column(String, index=True)
diff --git a/swh/lister/phabricator/tasks.py b/swh/lister/phabricator/tasks.py
--- a/swh/lister/phabricator/tasks.py
+++ b/swh/lister/phabricator/tasks.py
@@ -8,9 +8,9 @@
 
 
 @shared_task(name=__name__ + ".FullPhabricatorLister")
-def list_phabricator_full(**lister_args):
+def list_phabricator_full(url: str, instance: str):
     """Full update of a Phabricator instance"""
-    return PhabricatorLister(**lister_args).run()
+    return PhabricatorLister.from_configfile(url=url, instance=instance).run()
 
 
 @shared_task(name=__name__ + ".ping")
diff --git a/swh/lister/phabricator/tests/data/__init__.py b/swh/lister/phabricator/tests/data/__init__.py
new file mode 100644
diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py
--- a/swh/lister/phabricator/tests/test_lister.py
+++ b/swh/lister/phabricator/tests/test_lister.py
@@ -3,131 +3,27 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import importlib.resources
 import json
-import logging
-import re
-import unittest
 
-import requests_mock
+from swh.lister.phabricator.lister import get_repo_url
 
-from swh.lister.core.tests.test_lister import HttpListerTester
-from swh.lister.phabricator.lister import PhabricatorLister, get_repo_url
-
-logger = logging.getLogger(__name__)
-
-
-class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
-    Lister = PhabricatorLister
-    # first request will have the after parameter empty
-    test_re = re.compile(r"\&after=([^?&]*)")
-    lister_subdir = "phabricator"
-    good_api_response_file = "data/api_first_response.json"
-    good_api_response_undefined_protocol = "data/api_response_undefined_protocol.json"
-    bad_api_response_file = "data/api_empty_response.json"
-    # first_index must be retrieved through a bootstrap process for Phabricator
-    first_index = None
-    last_index = 12
-    entries_per_page = 10
-
-    convert_type = int
-
-    def request_index(self, request):
-        """(Override) This is needed to emulate the listing bootstrap
-        when no min_bound is provided to run
-        """
-        m = self.test_re.search(request.path_url)
-        idx = m.group(1)
-        if idx not in ("", "None"):
-            return int(idx)
-
-    def get_fl(self, override_config=None):
-        """(Override) Retrieve an instance of fake lister (fl).
-
-        """
-        if override_config or self.fl is None:
-            credentials = {"phabricator": {"fake": [{"password": "toto"}]}}
-            override_config = dict(credentials=credentials, **(override_config or {}))
-            self.fl = self.Lister(
-                url="https://fakeurl", instance="fake", override_config=override_config
-            )
-            self.fl.INITIAL_BACKOFF = 1
-
-        self.fl.reset_backoff()
-        return self.fl
-
-    def test_get_repo_url(self):
-        f = open(
-            "swh/lister/%s/tests/%s" % (self.lister_subdir, self.good_api_response_file)
-        )
+def test_get_repo_url():
+    with importlib.resources.open_text(
+        "swh.lister.phabricator.tests.data", "api_first_response.json"
+    ) as f:
         api_response = json.load(f)
-        repos = api_response["result"]["data"]
-        for repo in repos:
-            self.assertEqual(
-                "https://forge.softwareheritage.org/source/%s.git"
-                % (repo["fields"]["shortName"]),
-                get_repo_url(repo["attachments"]["uris"]["uris"]),
-            )
-
-        f = open(
-            "swh/lister/%s/tests/%s"
-            % (self.lister_subdir, self.good_api_response_undefined_protocol)
+    repos = api_response["result"]["data"]
+    for repo in repos:
+        expected_name = "https://forge.softwareheritage.org/source/%s.git" % (
+            repo["fields"]["shortName"]
         )
-        repo = json.load(f)
-        self.assertEqual(
-            "https://svn.blender.org/svnroot/bf-blender/",
-            get_repo_url(repo["attachments"]["uris"]["uris"]),
-        )
-
-    @requests_mock.Mocker()
-    def test_scheduled_tasks(self, http_mocker):
-        self.scheduled_tasks_test("data/api_next_response.json", 23, http_mocker)
+        assert get_repo_url(repo["attachments"]["uris"]["uris"]) == expected_name
 
-    @requests_mock.Mocker()
-    def test_scheduled_tasks_multiple_instances(self, http_mocker):
-
-        fl = self.create_fl_with_db(http_mocker)
-
-        # list first Phabricator instance
-        fl.run()
-
-        fl.instance = "other_fake"
-        fl.config["credentials"] = {
-            "phabricator": {"other_fake": [{"password": "foo"}]}
-        }
-
-        # list second Phabricator instance hosting repositories having
-        # same ids as those listed from the first instance
-        self.good_api_response_file = "data/api_first_response_other_instance.json"
-        self.last_index = 13
-        fl.run()
-
-        # check expected number of loading tasks
-        self.assertEqual(len(self.scheduler_tasks), 2 * self.entries_per_page)
-
-        # check tasks are not disabled
-        for task in self.scheduler_tasks:
-            self.assertTrue(task["status"] != "disabled")
-
-
-def test_phabricator_lister(lister_phabricator, requests_mock_datadir):
-    lister = lister_phabricator
-    assert lister.url == lister.DEFAULT_URL
-    assert lister.instance == "forge.softwareheritage.org"
-    lister.run()
-
-    r = lister.scheduler.search_tasks(task_type="load-git")
-    assert len(r) == 10
-
-    for row in r:
-        assert row["type"] == "load-git"
-        # arguments check
-        args = row["arguments"]["args"]
-        assert len(args) == 0
-
-        # kwargs
-        kwargs = row["arguments"]["kwargs"]
-        url = kwargs["url"]
-        assert lister.instance in url
-
-        assert row["policy"] == "recurring"
-        assert row["priority"] is None
+    with importlib.resources.open_text(
+        "swh.lister.phabricator.tests.data", "api_response_undefined_protocol.json",
+    ) as f:
+        repo = json.load(f)
+        expected_name = "https://svn.blender.org/svnroot/bf-blender/"
+        assert get_repo_url(repo["attachments"]["uris"]["uris"]) == expected_name
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -9,6 +9,14 @@
 
 from .test_utils import init_db
 
+lister_args = {
+    "phabricator": {
+        "instance": "softwareheritage",
+        "url": "https://forge.softwareheritage.org/api/diffusion.repository.search",
+        "api_token": "bogus",
+    },
+}
+
 
 def test_get_lister_wrong_input():
     """Unsupported lister should raise"""
@@ -25,7 +33,10 @@
     db_url = init_db().url()
     for lister_name in SUPPORTED_LISTERS:
         lst = get_lister(
-            lister_name, db_url, scheduler={"cls": "local", **swh_scheduler_config}
+            lister_name,
+            db_url,
+            scheduler={"cls": "local", **swh_scheduler_config},
+            **lister_args.get(lister_name, {}),
        )
 
        assert hasattr(lst, "run")
@@ -38,7 +49,6 @@
 
     listers = {
         "gitlab": "https://other.gitlab.uni/api/v4/",
-        "phabricator": "https://somewhere.org/api/diffusion.repository.search",
         "cgit": "https://some.where/cgit",
     }
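
Reviewer note (not part of the patch): with the new task signature in tasks.py, a listing
can no longer be triggered with arbitrary **lister_args; url and instance are explicit
arguments, and everything else (scheduler, credentials/api token) is loaded by
from_configfile() from the lister configuration file. An illustrative way to enqueue a
run, with placeholder values:

    from swh.lister.phabricator.tasks import list_phabricator_full

    # Celery's .delay() enqueues the task; the worker then builds the lister via
    # PhabricatorLister.from_configfile() using its own configuration file.
    list_phabricator_full.delay(
        url="https://forge.example.org/api/diffusion.repository.search",
        instance="forge.example.org",
    )
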