Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/tuleap/lister.py
# Copyright (C) 2021 The Software Heritage developers | # Copyright (C) 2021-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
from urllib.parse import urljoin | from urllib.parse import urljoin | ||||
import iso8601 | import iso8601 | ||||
import requests | |||||
from tenacity.before_sleep import before_sleep_log | |||||
from swh.lister.utils import http_retry | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | |||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
RepoPage = Dict[str, Any] | RepoPage = Dict[str, Any] | ||||
class TuleapLister(StatelessLister[RepoPage]): | class TuleapLister(StatelessLister[RepoPage]): | ||||
Show All 25 Lines | class TuleapLister(StatelessLister[RepoPage]): | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
credentials=credentials, | credentials=credentials, | ||||
url=url, | url=url, | ||||
instance=instance, | instance=instance, | ||||
) | ) | ||||
self.session = requests.Session() | self.session.headers.update({"Accept": "application/json"}) | ||||
self.session.headers.update( | |||||
{ | |||||
"Accept": "application/json", | |||||
"User-Agent": USER_AGENT, | |||||
} | |||||
) | |||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) | |||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: | |||||
logger.info("Fetching URL %s with params %s", url, params) | |||||
response = self.session.get(url, params=params) | |||||
if response.status_code != 200: | |||||
logger.warning( | |||||
"Unexpected HTTP status code %s on %s: %s", | |||||
response.status_code, | |||||
response.url, | |||||
response.content, | |||||
) | |||||
response.raise_for_status() | |||||
return response | |||||
@classmethod
def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage:
    """Reduce a raw repository record to the fields kept by the lister.

    Args:
        url: base URL against which the repository URI is resolved.
        repo_type: ``"git"`` selects ``REPO_GIT_PATH`` as the path prefix;
            any other value selects ``REPO_SVN_PATH``.
        repo: raw repository record; its ``name``, ``path`` and
            ``last_update_date`` keys are read.

    Returns:
        A dict with the ``project``, ``type``, ``uri`` and
        ``last_update_date`` keys.
    """
    path_prefix = (
        TuleapLister.REPO_GIT_PATH
        if repo_type == "git"
        else TuleapLister.REPO_SVN_PATH
    )
    return {
        "project": repo["name"],
        "type": repo_type,
        # The prefix and the repository path are glued together first, then
        # resolved relative to the base URL.
        "uri": urljoin(url, f"{path_prefix}{repo['path']}"),
        "last_update_date": repo["last_update_date"],
    }
def _get_repositories(self, url_repo) -> List[Dict[str, Any]]: | def _get_repositories(self, url_repo) -> List[Dict[str, Any]]: | ||||
ret = self.page_request(url_repo, {}) | ret = self.http_request(url_repo) | ||||
reps_list = ret.json()["repositories"] | reps_list = ret.json()["repositories"] | ||||
limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"]) | limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"]) | ||||
offset = int(ret.headers["X-PAGINATION-LIMIT"]) | offset = int(ret.headers["X-PAGINATION-LIMIT"]) | ||||
size = int(ret.headers["X-PAGINATION-SIZE"]) | size = int(ret.headers["X-PAGINATION-SIZE"]) | ||||
while offset < size: | while offset < size: | ||||
url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit) | url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit) | ||||
ret = self.page_request(url_offset, {}).json() | ret = self.http_request(url_offset).json() | ||||
reps_list = reps_list + ret["repositories"] | reps_list = reps_list + ret["repositories"] | ||||
offset += limit | offset += limit | ||||
return reps_list | return reps_list | ||||
def get_pages(self) -> Iterator[RepoPage]: | def get_pages(self) -> Iterator[RepoPage]: | ||||
# base with trailing slash, path without leading slash for urljoin | # base with trailing slash, path without leading slash for urljoin | ||||
url_api: str = urljoin(self.url, self.REPO_LIST_PATH) | url_api: str = urljoin(self.url, self.REPO_LIST_PATH) | ||||
url_projects = url_api + "/projects/" | url_projects = url_api + "/projects/" | ||||
# Get the list of projects. | # Get the list of projects. | ||||
response = self.page_request(url_projects, {}) | response = self.http_request(url_projects) | ||||
projects_list = response.json() | projects_list = response.json() | ||||
limit = int(response.headers["X-PAGINATION-LIMIT-MAX"]) | limit = int(response.headers["X-PAGINATION-LIMIT-MAX"]) | ||||
offset = int(response.headers["X-PAGINATION-LIMIT"]) | offset = int(response.headers["X-PAGINATION-LIMIT"]) | ||||
size = int(response.headers["X-PAGINATION-SIZE"]) | size = int(response.headers["X-PAGINATION-SIZE"]) | ||||
while offset < size: | while offset < size: | ||||
url_offset = ( | url_offset = ( | ||||
url_projects + "?offset=" + str(offset) + "&limit=" + str(limit) | url_projects + "?offset=" + str(offset) + "&limit=" + str(limit) | ||||
) | ) | ||||
ret = self.page_request(url_offset, {}).json() | ret = self.http_request(url_offset).json() | ||||
projects_list = projects_list + ret | projects_list = projects_list + ret | ||||
offset += limit | offset += limit | ||||
# Get list of repositories for each project. | # Get list of repositories for each project. | ||||
for p in projects_list: | for p in projects_list: | ||||
p_id = p["id"] | p_id = p["id"] | ||||
# Fetch Git repositories for project | # Fetch Git repositories for project | ||||
Show All 15 Lines |