phabricator: Raise on HTTP errors instead of stopping the listing

Review notes:
- get_pages() still logs a warning on any non-200 response, but now calls
  response.raise_for_status() where it used to `break` out of the pagination
  loop.
- test_lister_url_param now passes the loop variable `url` to the lister. It
  previously passed FORGE_BASE_URL on every iteration, so the URL variants
  listed in the loop were never exercised.
- Line breaks, indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch; check them against the base files
  before applying. The post-image index hash of test_lister.py predates the
  test_lister_url_param fix.

diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py
index 5c040e3..193239c 100644
--- a/swh/lister/phabricator/lister.py
+++ b/swh/lister/phabricator/lister.py
@@ -1,181 +1,182 @@
 # Copyright (C) 2019-2020 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
 import logging
 import random
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin
 
 import requests
 
 from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, StatelessLister
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 logger = logging.getLogger(__name__)
 
 
 PageType = List[Dict[str, Any]]
 
 
 class PhabricatorLister(StatelessLister[PageType]):
     """
     List all repositories hosted on a Phabricator instance.
 
     Args:
         url: base URL of a phabricator forge
             (for instance https://forge.softwareheritage.org)
         instance: string identifier for the listed forge
         api_token: authentication token for Conduit API
     """
 
     LISTER_NAME = "phabricator"
     API_REPOSITORY_PATH = "/api/diffusion.repository.search"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         url: str,
         instance: str,
         api_token: Optional[str] = None,
         credentials: CredentialsType = None,
     ):
         super().__init__(
             scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials
         )
 
         self.session = requests.Session()
         self.session.headers.update(
             {"Accept": "application/json", "User-Agent": USER_AGENT}
         )
 
         if api_token is not None:
             self.api_token = api_token
         else:
             if not self.credentials:
                 raise ValueError(
                     f"No credentials found for phabricator instance {self.instance};"
                     " Please set them in the lister configuration file."
                 )
 
             self.api_token = random.choice(self.credentials)["password"]
 
     def get_request_params(self, after: Optional[str]) -> Dict[str, str]:
         """Get the query parameters for the request."""
 
         base_params = {
             # Stable order
             "order": "oldest",
             # Add all URIs to the response
             "attachments[uris]": "1",
             # API token from stored credentials
             "api.token": self.api_token,
         }
 
         if after is not None:
             base_params["after"] = after
 
         return base_params
 
     @staticmethod
     def filter_params(params: Dict[str, str]) -> Dict[str, str]:
         """Filter the parameters for debug purposes"""
         return {
             k: (v if k != "api.token" else "**redacted**") for k, v in params.items()
         }
 
     def get_pages(self) -> Iterator[PageType]:
         after: Optional[str] = None
         while True:
             params = self.get_request_params(after)
             logger.debug(
-                "Retrieving results on URI=%s, parameters %s",
+                "Retrieving results on URI %s with parameters %s",
                 self.url,
                 self.filter_params(params),
             )
             response = self.session.post(self.url, data=params)
 
             if response.status_code != 200:
                 logger.warning(
-                    "Got unexpected status_code %s on %s: %s",
+                    "Unexpected HTTP status code %s on %s: %s",
                     response.status_code,
                     response.url,
                     response.content,
                 )
-                break
+
+            response.raise_for_status()
 
             response_data = response.json()
 
             if response_data.get("result") is None:
                 logger.warning(
                     "Got unexpected response on %s: %s", response.url, response_data,
                 )
                 break
 
             result = response_data["result"]
             yield result["data"]
             after = None
             if "cursor" in result and "after" in result["cursor"]:
                 after = result["cursor"]["after"]
 
             if not after:
                 logger.debug("Empty `after` cursor. All done")
                 break
 
     def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
         assert self.lister_obj.id is not None
 
         for repo in page:
             url = get_repo_url(repo["attachments"]["uris"]["uris"])
             if url is None:
                 short_name: Optional[str] = None
 
                 for field in "shortName", "name", "callsign":
                     short_name = repo["fields"].get(field)
                     if short_name:
                         break
 
                 logger.warning(
                     "No valid url for repository [%s] (phid=%s)",
                     short_name or repo["phid"],
                     repo["phid"],
                 )
                 continue
 
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=url,
                 visit_type=repo["fields"]["vcs"],
                 # The "dateUpdated" field returned by the Phabricator API only refers to
                 # the repository metadata; We can't use it for our purposes.
                 last_update=None,
             )
 
 
 def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[str]:
     """
     Return url for a hosted repository from its uris attachments according
     to the following priority lists:
     * protocol: https > http
     * identifier: shortname > callsign > id
     """
     processed_urls = defaultdict(dict)  # type: Dict[str, Any]
     for uri in attachments:
         protocol = uri["fields"]["builtin"]["protocol"]
         url = uri["fields"]["uri"]["effective"]
         identifier = uri["fields"]["builtin"]["identifier"]
         if protocol in ("http", "https"):
             processed_urls[protocol][identifier] = url
         elif protocol is None:
             for protocol in ("https", "http"):
                 if url.startswith(protocol):
                     processed_urls[protocol]["undefined"] = url
                     break
     for protocol in ["https", "http"]:
         for identifier in ["shortname", "callsign", "id", "undefined"]:
             if protocol in processed_urls and identifier in processed_urls[protocol]:
                 return processed_urls[protocol][identifier]
     return None
diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py
index 42218e4..496a2a2 100644
--- a/swh/lister/phabricator/tests/test_lister.py
+++ b/swh/lister/phabricator/tests/test_lister.py
@@ -1,113 +1,135 @@
 # Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 from pathlib import Path
 
 import pytest
+from requests.exceptions import HTTPError
 
 from swh.lister import USER_AGENT
 from swh.lister.phabricator.lister import PhabricatorLister, get_repo_url
 
 
 @pytest.fixture
 def phabricator_repositories_page1(datadir):
     return json.loads(
         Path(datadir, "phabricator_api_repositories_page1.json").read_text()
     )
 
 
 @pytest.fixture
 def phabricator_repositories_page2(datadir):
     return json.loads(
         Path(datadir, "phabricator_api_repositories_page2.json").read_text()
     )
 
 
 def test_get_repo_url(phabricator_repositories_page1):
     repos = phabricator_repositories_page1["result"]["data"]
     for repo in repos:
         expected_name = "https://forge.softwareheritage.org/source/%s.git" % (
             repo["fields"]["shortName"]
         )
         assert get_repo_url(repo["attachments"]["uris"]["uris"]) == expected_name
 
 
 def test_get_repo_url_undefined_protocol():
     undefined_protocol_uris = [
         {
             "fields": {
                 "uri": {
                     "raw": "https://svn.blender.org/svnroot/bf-blender/",
                     "display": "https://svn.blender.org/svnroot/bf-blender/",
                     "effective": "https://svn.blender.org/svnroot/bf-blender/",
                     "normalized": "svn.blender.org/svnroot/bf-blender",
                 },
                 "builtin": {"protocol": None, "identifier": None},
             },
         }
     ]
     expected_name = "https://svn.blender.org/svnroot/bf-blender/"
     assert get_repo_url(undefined_protocol_uris) == expected_name
 
 
 def test_lister_url_param(swh_scheduler):
     FORGE_BASE_URL = "https://forge.softwareheritage.org"
 
     API_REPOSITORY_PATH = "/api/diffusion.repository.search"
 
     for url in (
         FORGE_BASE_URL,
         f"{FORGE_BASE_URL}/",
         f"{FORGE_BASE_URL}/{API_REPOSITORY_PATH}",
         f"{FORGE_BASE_URL}/{API_REPOSITORY_PATH}/",
     ):
 
         lister = PhabricatorLister(
-            scheduler=swh_scheduler, url=FORGE_BASE_URL, instance="swh", api_token="foo"
+            scheduler=swh_scheduler, url=url, instance="swh", api_token="foo"
         )
 
         expected_url = f"{FORGE_BASE_URL}{API_REPOSITORY_PATH}"
 
         assert lister.url == expected_url
 
 
 def test_lister(
     swh_scheduler,
     requests_mock,
     phabricator_repositories_page1,
     phabricator_repositories_page2,
 ):
     FORGE_BASE_URL = "https://forge.softwareheritage.org"
     API_TOKEN = "foo"
 
     lister = PhabricatorLister(
         scheduler=swh_scheduler, url=FORGE_BASE_URL, instance="swh", api_token=API_TOKEN
     )
 
     def match_request(request):
         return (
             request.headers.get("User-Agent") == USER_AGENT
             and f"api.token={API_TOKEN}" in request.body
         )
 
     requests_mock.post(
         f"{FORGE_BASE_URL}{lister.API_REPOSITORY_PATH}",
         [
             {"json": phabricator_repositories_page1},
             {"json": phabricator_repositories_page2},
         ],
         additional_matcher=match_request,
     )
 
     stats = lister.run()
 
     expected_nb_origins = len(phabricator_repositories_page1["result"]["data"]) * 2
 
     assert stats.pages == 2
     assert stats.origins == expected_nb_origins
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins
     assert len(scheduler_origins) == expected_nb_origins
+
+
+def test_lister_request_error(
+    swh_scheduler, requests_mock, phabricator_repositories_page1,
+):
+    FORGE_BASE_URL = "https://forge.softwareheritage.org"
+
+    lister = PhabricatorLister(
+        scheduler=swh_scheduler, url=FORGE_BASE_URL, instance="swh", api_token="foo"
+    )
+
+    requests_mock.post(
+        f"{FORGE_BASE_URL}{lister.API_REPOSITORY_PATH}",
+        [
+            {"status_code": 200, "json": phabricator_repositories_page1},
+            {"status_code": 500, "reason": "Internal Server Error"},
+        ],
+    )
+
+    with pytest.raises(HTTPError):
+        lister.run()