Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/tuleap/lister.py
- This file was added.
Property | Old Value | New Value |
---|---|---|
File Mode | null | 100755 |
# Copyright (C) 2018-2021 The Software Heritage developers | ||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||||||
# See top-level LICENSE file for more information | ||||||||||||||||
import logging
import random
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin

import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from urllib3.util import parse_url

from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type of the items produced by the lister's get_pages(): one JSON-like
# mapping with string keys.
RepoPage = Dict[str, Any]
class TuleapLister(StatelessLister[RepoPage]):
    """List origins from Tuleap.

    Tuleap provides SVN and Git repositories hosting.

    Tuleap API getting started:
    https://tuleap.net/doc/en/user-guide/integration/rest.html

    Tuleap API reference:
    https://tuleap.net/api/explorer/

    Using the API we first request a list of projects, and from there request
    their associated repositories individually. Everything is paginated, and
    throttling is applied at the level of each individual GET call.
    """

    LISTER_NAME = "tuleap"

    # Absolute path of the REST API root on the Tuleap instance.
    REPO_LIST_PATH = "/api"
    # Path prefixes used to build the origin URL of a repository.
    REPO_GIT_PATH = "plugins/git/"
    REPO_SVN_PATH = "plugins/svn/"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: Optional[str] = None,
        api_token: Optional[str] = None,
        credentials: CredentialsType = None,
    ):
        """Set up the lister and its HTTP session.

        Args:
            scheduler: scheduler interface forwarded to the parent lister.
            url: base URL of the Tuleap instance.
            instance: name of the instance; defaults to the host part of
                ``url``.
            api_token: optional token sent in the ``Authorization`` header.
                When omitted, one is picked at random from the configured
                credentials, if any.
            credentials: credentials forwarded to the parent lister.
        """
        if instance is None:
            instance = parse_url(url).host

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

        # DEV: authentication not yet available.
        # NOTE(review): a reviewer recalls that Tuleap describes token-based
        # authentication as deprecated and mentions custom headers instead;
        # confirm against the Tuleap REST authentication documentation
        # before relying on this code path.
        if api_token is None:
            if self.credentials:
                cred = random.choice(self.credentials)
                username = cred.get("username")
                api_token = cred["password"]
                logger.warning(
                    "Using authentication token from user %s", username or "???"
                )
            else:
                logger.warning(
                    "No authentication token set in configuration, using anonymous mode"
                )

        if api_token:
            self.session.headers["Authorization"] = "Token %s" % api_token

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Issue one GET request through the lister's session.

        Args:
            url: URL to fetch.
            params: query-string parameters to send along.

        Returns:
            The HTTP response.

        Raises:
            requests.HTTPError: raised by ``raise_for_status()`` on an error
                response. Any status other than 200 is logged beforehand.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    @classmethod
    def results_simplified(cls, url: str, rtype: str, repo: RepoPage) -> RepoPage:
        """Reduce a repository record from the API to the fields the lister uses.

        Args:
            url: URL the repository path prefix is resolved against with
                ``urljoin``.
            rtype: repository type. ``"git"`` selects ``REPO_GIT_PATH``; any
                other value selects ``REPO_SVN_PATH`` (the SVN side is not
                exercised by this lister yet).
            repo: repository record; must provide the ``name``, ``path`` and
                ``last_update_date`` keys.

        Returns:
            A mapping with the ``project``, ``type``, ``uri`` and
            ``last_update_date`` keys.
        """
        prefix = cls.REPO_GIT_PATH if rtype == "git" else cls.REPO_SVN_PATH
        # XXX remove project param
        return {
            "project": repo["name"],
            "type": rtype,
            "uri": urljoin(url, prefix + repo["path"]),
            "last_update_date": repo["last_update_date"],
        }

    def _iter_payloads(self, url: str) -> Iterator[Any]:
        """Yield the decoded JSON payload of every page served at ``url``.

        The first request is sent without pagination parameters. Its
        ``X-PAGINATION-*`` response headers then drive the following
        requests: ``X-PAGINATION-LIMIT`` is the offset of the second page,
        ``X-PAGINATION-LIMIT-MAX`` the page size requested from then on, and
        ``X-PAGINATION-SIZE`` the total number of items.
        """
        response = self.page_request(url, {})
        yield response.json()

        limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
        offset = int(response.headers["X-PAGINATION-LIMIT"])
        size = int(response.headers["X-PAGINATION-SIZE"])
        while offset < size:
            page = self.page_request(url, {"offset": offset, "limit": limit})
            yield page.json()
            offset += limit

    def _get_repositories(self, rurl: str) -> List[RepoPage]:
        """Return every repository record listed at ``rurl``, across all pages."""
        repositories: List[RepoPage] = []
        for payload in self._iter_payloads(rurl):
            repositories.extend(payload["repositories"])
        return repositories

    def get_pages(self) -> Iterator[RepoPage]:
        """Yield one simplified record per Git repository of every project."""
        # REPO_LIST_PATH is an absolute path, so urljoin replaces whatever
        # path self.url carries.
        api_url: str = urljoin(self.url, self.REPO_LIST_PATH)
        projects_url = api_url + "/projects/"

        # Get the complete list of projects first.
        projects: List[Dict[str, Any]] = []
        for payload in self._iter_payloads(projects_url):
            projects.extend(payload)

        # Then get the list of Git repositories of each project.
        for project in projects:
            git_url = api_url + "/projects/" + str(project["id"]) + "/git"
            for repo in self._get_repositories(git_url):
                yield self.results_simplified(api_url, "git", repo)

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a simplified Tuleap repository record into a ListedOrigin."""
        assert self.lister_obj.id is not None

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=page["uri"],
            visit_type=page["type"],
            last_update=iso8601.parse_date(page["last_update_date"]),
        )
I recall that they describe token-based authentication as deprecated [1],
and they mention something about custom headers [2].
So if it's indeed not the right implementation [3], it'd be fine if you provide a tuleap
lister implementation relying only on anonymous listing first. That means, you can drop
that credentials initialization part, but keep the credentials parameter in the
constructor (and in the super call) though (implementation detail which requires it).
We (including you) can always dig in later to improve the implementation so we can
actually connect to the tuleap instance (if that's needed, i don't know at which point
the anonymous listing is restricted).
[1] https://tuleap.net/doc/en/user-guide/integration/rest/quick-start/auth.html
[2] custom headers mentioned:
[3] it's better if new code is tested. You can rely on the jenkins build to detect
what's amiss