Changeset View
Standalone View
swh/lister/gogs/tests/test_lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
from pathlib import Path | from pathlib import Path | ||||
from typing import List | from typing import List | ||||
from unittest.mock import Mock | from unittest.mock import Mock | ||||
import pytest | import pytest | ||||
from requests import HTTPError | from requests import HTTPError | ||||
from swh.lister.gogs.lister import GogsLister | from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
TRY_GOGS_URL = "https://try.gogs.io/api/v1/" | TRY_GOGS_URL = "https://try.gogs.io/api/v1/" | ||||
def try_gogs_page(n: int): | def try_gogs_page(n: int): | ||||
return TRY_GOGS_URL + f"repos/search?page={n}&limit=3" | return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3" | ||||
P1 = try_gogs_page(1) | |||||
P2 = try_gogs_page(2) | |||||
P3 = try_gogs_page(3) | |||||
@pytest.fixture | @pytest.fixture | ||||
def trygogs_p1(datadir): | def trygogs_p1(datadir): | ||||
text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text() | text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text() | ||||
headers = { | headers = {"Link": '<{p2}>; rel="next",<{p3}>; rel="last"'.format(p2=P2, p3=P3)} | ||||
"Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=try_gogs_page(2)) | page_result = GogsListerPage( | ||||
} | repos=GogsLister.extract_repos(json.loads(text)), next_link=P2 | ||||
page_result = GogsLister.results_simplified(json.loads(text)) | ) | ||||
origin_urls = [r["clone_url"] for r in page_result] | origin_urls = [r["clone_url"] for r in page_result.repos] | ||||
return text, headers, page_result, origin_urls | return text, headers, page_result, origin_urls | ||||
@pytest.fixture | @pytest.fixture | ||||
def trygogs_p2(datadir): | def trygogs_p2(datadir): | ||||
text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text() | text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text() | ||||
headers = { | headers = {"Link": '<{p3}>; rel="next",<{p1}>; rel="prev"'.format(p1=P1, p3=P3)} | ||||
"Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) | page_result = GogsListerPage( | ||||
} | repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 | ||||
page_result = GogsLister.results_simplified(json.loads(text)) | ) | ||||
origin_urls = [r["clone_url"] for r in page_result] | origin_urls = [r["clone_url"] for r in page_result.repos] | ||||
return text, headers, page_result, origin_urls | |||||
@pytest.fixture | |||||
def trygogs_p3(datadir): | |||||
text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text() | |||||
headers = {"Link": '<{p2}>; rel="prev",<{p1}>; rel="first"'.format(p1=P1, p2=P2)} | |||||
page_result = GogsListerPage( | |||||
repos=GogsLister.extract_repos(json.loads(text)), next_link=None | |||||
) | |||||
origin_urls = [r["clone_url"] for r in page_result.repos] | |||||
return text, headers, page_result, origin_urls | return text, headers, page_result, origin_urls | ||||
@pytest.fixture | @pytest.fixture | ||||
def trygogs_empty_page(): | def trygogs_empty_p3(): | ||||
origins_urls = [] | origins_urls = [] | ||||
page_result = {"data": [], "ok": True} | body = {"data": [], "ok": True} | ||||
headers = { | headers = { | ||||
"Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) | "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1)) | ||||
} | } | ||||
text = json.dumps(page_result) | page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None) | ||||
text = json.dumps(body) | |||||
return text, headers, page_result, origins_urls | return text, headers, page_result, origins_urls | ||||
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): | def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): | ||||
"""Asserts that the two collections have the same origin URLs. | """Asserts that the two collections have the same origin URLs. | ||||
Does not test last_update.""" | Does not test last_update.""" | ||||
sorted_lister_urls = list(sorted(lister_urls)) | sorted_lister_urls = list(sorted(lister_urls)) | ||||
sorted_scheduler_origins = list(sorted(scheduler_origins)) | sorted_scheduler_origins = list(sorted(scheduler_origins)) | ||||
assert len(sorted_lister_urls) == len(sorted_scheduler_origins) | assert len(sorted_lister_urls) == len(sorted_scheduler_origins) | ||||
for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): | for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): | ||||
assert l_url == s_origin.url | assert l_url == s_origin.url | ||||
def test_gogs_full_listing( | def test_gogs_full_listing( | ||||
swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_empty_page | swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3 | ||||
): | ): | ||||
kwargs = dict( | kwargs = dict( | ||||
url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" | url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" | ||||
) | ) | ||||
lister = GogsLister(scheduler=swh_scheduler, **kwargs) | lister = GogsLister(scheduler=swh_scheduler, **kwargs) | ||||
lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") | lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") | ||||
p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 | p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 | ||||
p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 | p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 | ||||
p3_text, p3_headers, _, _ = trygogs_empty_page | p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 | ||||
requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers) | requests_mock.get(P1, text=p1_text, headers=p1_headers) | ||||
requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers) | requests_mock.get(P2, text=p2_text, headers=p2_headers) | ||||
requests_mock.get(try_gogs_page(3), text=p3_text, headers=p3_headers) | requests_mock.get(P3, text=p3_text, headers=p3_headers) | ||||
stats = lister.run() | stats = lister.run() | ||||
assert stats.pages == 2 | assert stats.pages == 3 | ||||
assert stats.origins == 6 | assert stats.origins == 9 | ||||
calls = [mocker.call(p1_result), mocker.call(p2_result)] | calls = map(mocker.call, [p1_result, p2_result, p3_result]) | ||||
lister.get_origins_from_page.assert_has_calls(calls) | lister.get_origins_from_page.assert_has_calls(list(calls)) | ||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | ||||
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) | check_listed_origins( | ||||
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins | |||||
) | |||||
assert lister.get_state_from_scheduler() is None | assert ( | ||||
lister.get_state_from_scheduler().last_seen_next_link == P3 | |||||
) # TODO: FIXME It should be None? | |||||
def test_gogs_auth_instance( | def test_gogs_auth_instance( | ||||
swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_page | swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_empty_p3 | ||||
): | ): | ||||
"""Covers token authentication, token from credentials, | """Covers token authentication, token from credentials, | ||||
instance inference from URL.""" | instance inference from URL.""" | ||||
api_token = "secret" | api_token = "secret" | ||||
instance = "try.gogs.io" | instance = "try_gogs" | ||||
creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} | |||||
kwargs1 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) | # Test lister initialization without api_token or credentials: | ||||
lister = GogsLister(scheduler=swh_scheduler, **kwargs1) | with pytest.raises(ValueError, match="No credentials or API token provided"): | ||||
kwargs1 = dict(url=TRY_GOGS_URL, instance=instance) | |||||
GogsLister(scheduler=swh_scheduler, **kwargs1) | |||||
# test API token | # Test lister initialization using api_token: | ||||
assert "Authorization" in lister.session.headers | kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) | ||||
lister = GogsLister(scheduler=swh_scheduler, **kwargs2) | |||||
assert lister.session.headers["Authorization"].lower() == "token %s" % api_token | assert lister.session.headers["Authorization"].lower() == "token %s" % api_token | ||||
with pytest.raises(ValueError, match="No credentials or API token provided"): | # Test lister initialization with credentials: | ||||
kwargs2 = dict(url=TRY_GOGS_URL, instance=instance) | creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} | ||||
GogsLister(scheduler=swh_scheduler, **kwargs2) | |||||
kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3) | kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3) | ||||
lister = GogsLister(scheduler=swh_scheduler, **kwargs3) | lister = GogsLister(scheduler=swh_scheduler, **kwargs3) | ||||
# test API token from credentials | |||||
assert "Authorization" in lister.session.headers | |||||
assert lister.session.headers["Authorization"].lower() == "token %s" % api_token | assert lister.session.headers["Authorization"].lower() == "token %s" % api_token | ||||
assert lister.instance == "try_gogs" | |||||
# test instance inference from URL | |||||
assert lister.instance | |||||
assert "gogs" in lister.instance | |||||
# setup requests mocking | # setup requests mocking | ||||
p1_text, p1_headers, _, _ = trygogs_p1 | p1_text, p1_headers, _, _ = trygogs_p1 | ||||
p2_text, p2_headers, _, _ = trygogs_empty_page | p2_text, p2_headers, _, _ = trygogs_p2 | ||||
p3_text, p3_headers, _, _ = trygogs_empty_p3 | |||||
base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH | requests_mock.get(P1, text=p1_text, headers=p1_headers) | ||||
requests_mock.get(base_url, text=p1_text, headers=p1_headers) | requests_mock.get(P2, text=p2_text, headers=p2_headers) | ||||
requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers) | requests_mock.get(P3, text=p3_text, headers=p3_headers) | ||||
# now check the lister runs without error | |||||
stats = lister.run() | |||||
assert stats.pages == 2 | # lister should run without any error and extract the origins | ||||
assert stats.origins == 3 | stats = lister.run() | ||||
assert stats.pages == 3 | |||||
assert stats.origins == 6 | |||||
@pytest.mark.parametrize("http_code", [400, 500, 502]) | @pytest.mark.parametrize("http_code", [400, 500, 502]) | ||||
def test_gogs_list_http_error(swh_scheduler, requests_mock, http_code): | def test_gogs_list_http_error( | ||||
swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3 | |||||
): | |||||
"""Test handling of some HTTP errors commonly encountered""" | """Test handling of some HTTP errors commonly encountered""" | ||||
lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") | lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") | ||||
p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 | |||||
p3_text, p3_headers, _, _ = trygogs_p3 | |||||
base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH | base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH | ||||
requests_mock.get(base_url, status_code=http_code) | requests_mock.get( | ||||
base_url, | |||||
[ | |||||
{"text": p1_text, "headers": p1_headers, "status_code": 200}, | |||||
{"status_code": http_code}, | |||||
{"text": p3_text, "headers": p3_headers, "status_code": 200}, | |||||
], | |||||
) | |||||
with pytest.raises(HTTPError): | with pytest.raises(HTTPError): | ||||
lister.run() | lister.run() | ||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | ||||
assert len(scheduler_origins) == 0 | check_listed_origins( | ||||
p1_origin_urls, scheduler_origins | |||||
) # Only the first page is listed | |||||
def test_gogs_incremental_lister( | |||||
swh_scheduler, | |||||
requests_mock, | |||||
mocker, | |||||
trygogs_p1, | |||||
trygogs_p2, | |||||
trygogs_empty_p3, | |||||
trygogs_p3, | |||||
): | |||||
kwargs = dict( | |||||
url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" | |||||
) | |||||
lister = GogsLister(scheduler=swh_scheduler, **kwargs) | |||||
lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") | |||||
p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 | |||||
p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 | |||||
( | |||||
p3_text, | |||||
p3_headers, | |||||
p3_result, | |||||
vlorentz: Does it really mock the first listing attempt? I would expect the second request (with `P2`)… | |||||
Done Inline Actions
If P2 doesn't provide a "next" link header, then the lister will throw an error because of `assert len(response.links) > 0, "API changed: no Link header found"` (the same happens for the Gitea lister). What is the difference between a normal lister and an incremental one? KShivendu: > I would expect the second request (with P2) not to have a Link header
If `P2` doesn't… | |||||
Done Inline Actions *If P2 doesn't provide a ~~"next"~~ link header. But if it does provide a link header without a "next" link, then the first attempt will stop at page 2 and last_seen_next_link will be None. Now, when lister.run() is called for the second time, it will recrawl both pages again. In the current test, the first lister run fails and last_seen_next_link points to page 3. Hence, in the second attempt, the lister only crawls that page. Isn't this the expected behavior? Am I missing something? KShivendu: *If `P2` doesn't provide a ~~"next"~~ link header
But, if it does provide a link header… | |||||
Not Done Inline Actions
It doesn't make sense for the last page to link to another page that doesn't exist. I didn't try with Gogs as their API requires authentication, but with Gitea:
$ curl -i https://try.gitea.io/api/v1/repos/search/\?limit\=3 | grep "^link"
link: <https://try.gitea.io/api/v1/repos/search/?limit=3&page=2>; rel="next",<https://try.gitea.io/api/v1/repos/search/?limit=3&page=1753>; rel="last"
$ curl -i https://try.gitea.io/api/v1/repos/search/\?limit\=3\&page\=1753 | grep "^link"
link: <https://try.gitea.io/api/v1/repos/search/?limit=3&page=1>; rel="first",<https://try.gitea.io/api/v1/repos/search/?limit=3&page=1752>; rel="prev"
So there is indeed a link header, but no next link. vlorentz: It doesn't make sense for the last page to link to another page that doesn't exist.
I didn't… | |||||
Done Inline Actions
Actually, there are instances like T4423 where certain pages cannot be crawled because of some fatal repos. I've created an issue upstream, but we haven't received any update from the Gogs maintainers. I proposed T1721#88903 to fix this. You can read @ardumont's opinion on this in T1721#88915. KShivendu: > It doesn't make sense for the last page to link to another page that doesn't exist.
Actually… | |||||
Not Done Inline Actions But that's not what this test does. This test is written as a "normal" listing run and ends with a 400. T4423 is a bug, which is not normal, and ends with a 500. vlorentz: but that's not what this test does. This test is written as a "normal" listing run and ends… | |||||
p3_origin_urls, | |||||
) = trygogs_empty_p3 # Note: 3rd page is empty in the first listing | |||||
requests_mock.get(P1, text=p1_text, headers=p1_headers) | |||||
requests_mock.get(P2, text=p2_text, headers=p2_headers) | |||||
requests_mock.get(P3, text=p3_text, headers=p3_headers) | |||||
# First listing: | |||||
stats = lister.run() | |||||
assert stats.pages == 3 | |||||
assert stats.origins == 6 | |||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | |||||
assert lister.updated | |||||
lister_state = lister.get_state_from_scheduler() | |||||
last_seen_next_link = lister_state.last_seen_next_link | |||||
assert last_seen_next_link == P3 | |||||
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) | |||||
# Second listing should restart from last state: | |||||
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 | |||||
requests_mock.get(P3, text=p3_text, headers=p3_headers) | |||||
lister.session.get = mocker.spy(lister.session, "get") | |||||
stats_restart = lister.run() | |||||
assert stats_restart.pages == 1 | |||||
assert stats_restart.origins == 3 | |||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results | |||||
page_id = _parse_page_id(last_seen_next_link) | |||||
query_params = lister.query_params | |||||
query_params["page"] = page_id | |||||
lister.session.get.assert_called_once_with( | |||||
TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params | |||||
) | |||||
# All the 9 origins (3 pages) should be passed on to the scheduler: | |||||
check_listed_origins( | |||||
p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins | |||||
) |
Does it really mock the first listing attempt? I would expect the second request (with P2) not to have a Link header, so the lister wouldn't send the third request (with P3).