diff --git a/CONTRIBUTORS b/CONTRIBUTORS --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -3,4 +3,5 @@ Léni Gauffier Yann Gautier Sushant Sushant -Hezekiah Maina \ No newline at end of file +Hezekiah Maina +Boris Baldassari diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` +- `swh.lister.tuleap` Dependencies ------------ @@ -35,7 +36,7 @@ ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`) +`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister name introduced above). diff --git a/docs/tutorial.rst b/docs/tutorial.rst --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -342,6 +342,25 @@ parameters if the service supports it, and by extracting from the response only the information needed into a structured page. This all makes for easier debugging. +Misc files +^^^^^^^^^^^^^^^ + +There are also a few files that need to be modified outside of the lister directory, namely: + +* `/setup.py` to add your lister to the end of the list in the *setup* section:: + + entry_points=""" + [swh.cli.subcommands] + lister=swh.lister.cli + [swh.workers] + lister.bitbucket=swh.lister.bitbucket:register + lister.cgit=swh.lister.cgit:register + ...""" + +* `/swh/lister/tests/test_cli.py` to get a default set of parameters in scheduler-related tests. +* `/README.md` to reference the new lister. +* `/CONTRIBUTORS` to add your name. 
def register():
    """Describe the Tuleap lister plugin.

    The lister class is imported inside the function, i.e. only once
    ``register()`` is actually called.

    Returns:
        A dict with the lister class under ``"lister"`` and, under
        ``"task_modules"``, a one-element list naming this package's
        ``tasks`` module.
    """
    from .lister import TuleapLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": TuleapLister, "task_modules": [tasks_module]}
class TuleapLister(StatelessLister[RepoPage]):
    """List origins from Tuleap.

    Tuleap provides SVN and Git repositories hosting.

    Tuleap API getting started:
    https://tuleap.net/doc/en/user-guide/integration/rest.html
    Tuleap API reference:
    https://tuleap.net/api/explorer/

    Using the API we first request a list of projects, and from there request their
    associated repositories individually. Everything is paginated, code uses throttling
    at the individual GET call level.

    Only the ``git`` endpoint of each project is queried by :meth:`get_pages`;
    ``REPO_SVN_PATH`` is only used by :meth:`results_simplified` when it is handed
    a non-git repository type.
    """

    LISTER_NAME = "tuleap"

    # Absolute path of the REST API on the instance.
    REPO_LIST_PATH = "/api"
    # Path prefixes (relative to the instance root) of the repository URLs.
    REPO_GIT_PATH = "plugins/git/"
    REPO_SVN_PATH = "plugins/svn/"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
    ):
        """Set up the lister and the HTTP session used for all API calls.

        Args:
            scheduler: scheduler interface handed over to the base lister
            url: base URL of the Tuleap instance
            instance: name of the instance; defaults to the host part of ``url``
            credentials: credentials handed over to the base lister
        """
        if instance is None:
            instance = parse_url(url).host

        super().__init__(
            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """GET ``url`` with the query parameters ``params``.

        Any status code other than 200 is logged as a warning.

        NOTE(review): retries are delegated to ``throttling_retry`` from
        ``swh.lister.utils``; presumably it retries rate-limited requests --
        confirm against that helper.

        Raises:
            requests.HTTPError: when the response carries an HTTP error status.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)

        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    @classmethod
    def results_simplified(cls, url: str, rtype: str, repo: RepoPage) -> RepoPage:
        """Reduce a repository record of the API to the fields the lister needs.

        Args:
            url: URL the repository path is resolved against (with ``urljoin``)
            rtype: repository type; ``"git"`` selects the git prefix, anything
                else the svn prefix
            repo: repository record; its ``name``, ``path`` and
                ``last_update_date`` keys are read

        Returns:
            A dict with the keys ``project``, ``type``, ``uri`` and
            ``last_update_date``.
        """
        # Go through ``cls`` so that a subclass overriding the prefixes is honoured.
        prefix_url = cls.REPO_GIT_PATH if rtype == "git" else cls.REPO_SVN_PATH
        return {
            "project": repo["name"],
            "type": rtype,
            "uri": urljoin(url, f"{prefix_url}{repo['path']}"),
            "last_update_date": repo["last_update_date"],
        }

    def _get_paginated(self, url: str) -> Iterator[Any]:
        """Yield the decoded JSON body of every page of a paginated endpoint.

        The first request carries no query parameters. Its
        ``X-PAGINATION-LIMIT`` header is used as the offset of the next page,
        ``X-PAGINATION-LIMIT-MAX`` as the size of the following pages and
        ``X-PAGINATION-SIZE`` as the total number of items.
        """
        response = self.page_request(url, {})
        yield response.json()

        limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
        offset = int(response.headers["X-PAGINATION-LIMIT"])
        size = int(response.headers["X-PAGINATION-SIZE"])
        while offset < size:
            # Let requests build the query string instead of concatenating it.
            yield self.page_request(url, {"offset": offset, "limit": limit}).json()
            offset += limit

    def _get_repositories(self, rurl: str) -> List[Dict[str, Any]]:
        """Return all repository records served by the paginated endpoint ``rurl``."""
        repositories: List[Dict[str, Any]] = []
        for page in self._get_paginated(rurl):
            repositories.extend(page["repositories"])
        return repositories

    def get_pages(self) -> Iterator[RepoPage]:
        """Yield one simplified record per Git repository of every project.

        Each yielded "page" therefore describes a single repository.
        """
        # REPO_LIST_PATH is an absolute path: urljoin replaces the path of
        # self.url with it.
        url: str = urljoin(self.url, self.REPO_LIST_PATH)
        projects_url = url + "/projects/"

        # Get the complete list of projects before querying their repositories.
        projects: List[Dict[str, Any]] = []
        for projects_page in self._get_paginated(projects_url):
            projects.extend(projects_page)

        # Get the list of Git repositories of each project.
        for project in projects:
            git_url = f"{url}/projects/{project['id']}/git"
            for repo in self._get_repositories(git_url):
                yield self.results_simplified(url, "git", repo)

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert the simplified record of one Tuleap repository into a ListedOrigin.

        ``page`` is a dict as built by :meth:`results_simplified`; its ``uri``,
        ``type`` and ``last_update_date`` keys are read.
        """
        assert self.lister_obj.id is not None

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=page["uri"],
            visit_type=page["type"],
            last_update=iso8601.parse_date(page["last_update_date"]),
        )
"testmanagement_nodes", + "uri": "projects/685/testmanagement_nodes" + }, + { + "type": "project_services", + "uri": "projects/685/project_services" + }, + { + "type": "user_groups", + "uri": "projects/685/user_groups" + }, + { + "type": "phpwiki", + "uri": "projects/685/phpwiki" + }, + { + "type": "heartbeats", + "uri": "projects/685/heartbeats" + }, + { + "type": "labels", + "uri": "projects/685/labels" + } + ], + "additional_informations": [], + "is_member_of": false, + "description": "Manjaro Memo Documentation est un projet Sphinx portant sur l'utilisation et la maj de Manjaro (et de ses outils) ainsi que sur Systemd et Journactl. Il comprendra tout un ensemble de commande pour se servir correctement de ce système dérivé d'Archlinux.", + "additional_fields": [ + { + "name": "project_desc_name:full_desc", + "value": "" + }, + { + "name": "project_desc_name:other_comments", + "value": "" + } + ], + "id": 685, + "uri": "projects/685", + "label": "Manjaro Memo Documentation", + "shortname": "manjaromemodoc", + "status": "active", + "access": "public", + "is_template": false + }, + { + "resources": [ + { + "type": "git", + "uri": "projects/309/git" + }, + { + "type": "trackers", + "uri": "projects/309/trackers" + }, + { + "type": "backlog", + "uri": "projects/309/backlog" + }, + { + "type": "milestones", + "uri": "projects/309/milestones" + }, + { + "type": "plannings", + "uri": "projects/309/plannings" + }, + { + "type": "labeled_items", + "uri": "projects/309/labeled_items" + }, + { + "type": "svn", + "uri": "projects/309/svn" + }, + { + "type": "testmanagement_campaigns", + "uri": "projects/309/testmanagement_campaigns" + }, + { + "type": "testmanagement_definitions", + "uri": "projects/309/testmanagement_definitions" + }, + { + "type": "testmanagement_nodes", + "uri": "projects/309/testmanagement_nodes" + }, + { + "type": "project_services", + "uri": "projects/309/project_services" + }, + { + "type": "user_groups", + "uri": "projects/309/user_groups" + }, + { + 
"type": "phpwiki", + "uri": "projects/309/phpwiki" + }, + { + "type": "heartbeats", + "uri": "projects/309/heartbeats" + }, + { + "type": "labels", + "uri": "projects/309/labels" + } + ], + "additional_informations": [], + "is_member_of": false, + "description": "a library for audio and music analysis", + "additional_fields": [ + { + "name": "project_desc_name:full_desc", + "value": "" + }, + { + "name": "project_desc_name:other_comments", + "value": "" + } + ], + "id": 309, + "uri": "projects/309", + "label": "aubio", + "shortname": "aubio", + "status": "active", + "access": "public", + "is_template": false + } +] diff --git a/swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 b/swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 new file mode 100755 --- /dev/null +++ b/swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 @@ -0,0 +1,18 @@ +{"repositories": + [ + { + "id": 295, + "uri": "git/295", + "name" : "manjaro-memo-documentation", + "label": "manjaro-memo-documentation", + "path": "manjaromemodoc/manjaro-memo-documentation.git", + "path_without_project": "", + "description": "-- Default description --", + "last_update_date": "2020-10-03T15:27:02+02:00", + "permissions": "None", + "server": "None", + "html_url": "/plugins/git/manjaromemodoc/manjaro-memo-documentation", + "additional_information": [] + } + ] +} \ No newline at end of file diff --git a/swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 b/swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 new file mode 100644 --- /dev/null +++ b/swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 @@ -0,0 +1,18 @@ +{"repositories": + [ + { + "id": 309, + "uri": "git/309", + "name": "myaurora", + "label": "myaurora", + "path": "myaurora/myaurora.git", + "path_without_project": "", + "description": "-- Default description --", + "last_update_date": "2021-03-04T08:43:40+01:00", + "permissions": "None", + "server": "None", + "html_url": "/plugins/git/myaurora/myaurora", + "additional_information": [] + } + ] 
# Base URL of the Tuleap instance the fixtures were recorded from.
TULEAP_URL = "https://tuleap.net/"
# API endpoints mocked by the tests.
TULEAP_PROJECTS_URL = TULEAP_URL + "api/projects/"
TULEAP_REPO_1_URL = TULEAP_URL + "api/projects/685/git"  # manjaromemodoc
TULEAP_REPO_2_URL = TULEAP_URL + "api/projects/309/git"  # myaurora

# Origin URLs matching the repositories of the repo_1 and repo_2 fixtures.
GIT_REPO_1 = (
    "https://tuleap.net/plugins/git/manjaromemodoc/manjaro-memo-documentation.git"
)
# Was assigned to GIT_REPO_1 a second time, which silently dropped the first URL.
GIT_REPO_2 = "https://tuleap.net/plugins/git/myaurora/myaurora.git"
def check_listed_origins(
    lister_urls: List[str], scheduler_origins: List["ListedOrigin"]
):
    """Assert that the lister and the scheduler hold the same origin URLs.

    The comparison is made on URLs only and ignores ordering: both sides are
    sorted by URL before being compared, so the result does not depend on how
    the origin objects themselves would be ordered. Does not test last_update.

    Args:
        lister_urls: origin URLs expected from the lister
        scheduler_origins: origins recorded by the scheduler; only their
            ``url`` attribute is read

    Raises:
        AssertionError: when the two collections of URLs differ.
    """
    assert sorted(lister_urls) == sorted(origin.url for origin in scheduler_origins)
# Copyright (C) 2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.lister.pattern import ListerStats


def _check_full_listing(celery_app, mocker, task_kwargs):
    """Send the full-listing task with ``task_kwargs`` and check how the lister is used.

    The lister class seen by the tasks module is replaced by a mock whose
    ``from_configfile`` returns the mock itself and whose ``run`` returns fixed
    statistics.
    """
    mocked_lister = mocker.patch("swh.lister.tuleap.tasks.TuleapLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = ListerStats(pages=10, origins=500)

    result = celery_app.send_task(
        "swh.lister.tuleap.tasks.FullTuleapLister", kwargs=task_kwargs,
    )
    assert result
    result.wait()
    assert result.successful()

    mocked_lister.from_configfile.assert_called_once_with(**task_kwargs)
    mocked_lister.run.assert_called_once_with()


def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task completes successfully and answers "OK"."""
    result = swh_scheduler_celery_app.send_task("swh.lister.tuleap.tasks.ping")
    assert result
    result.wait()
    assert result.successful()
    assert result.result == "OK"


def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Full listing task called with the instance URL only."""
    _check_full_listing(
        swh_scheduler_celery_app, mocker, {"url": "https://tuleap.net"}
    )


def test_full_listing_params(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Full listing task called with both the URL and the instance name."""
    _check_full_listing(
        swh_scheduler_celery_app,
        mocker,
        {"url": "https://tuleap.net", "instance": "tuleap.net"},
    )