Changeset View
Standalone View
swh/lister/gogs/tests/test_lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
from pathlib import Path | from pathlib import Path | ||||
from typing import List | from typing import List | ||||
from unittest.mock import Mock | from unittest.mock import Mock | ||||
import pytest | import pytest | ||||
from requests import HTTPError | from requests import HTTPError | ||||
from swh.lister.gogs.lister import GogsLister | from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
# Base URL of the Gogs instance mocked throughout these tests.
TRY_GOGS_URL = "https://try.gogs.io/api/v1/"
def try_gogs_page(n: int) -> str:
    """Build the mocked repo-search URL for page *n* (page size 3)."""
    return f"{TRY_GOGS_URL}{GogsLister.REPO_LIST_PATH}?page={n}&limit=3"
# Shorthands for the first four result pages served by the mocked API.
P1, P2, P3, P4 = (try_gogs_page(n) for n in range(1, 5))
@pytest.fixture
def trygogs_p1(datadir):
    """First repository page: three repos and a ``rel="next"`` link to P2."""
    text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text()
    headers = {"Link": f'<{P2}>; rel="next"'}
    repos = GogsLister.extract_repos(json.loads(text))
    page_result = GogsListerPage(repos=repos, next_link=P2)
    origin_urls = [repo["clone_url"] for repo in repos]
    return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p2(datadir):
    """Second repository page: three repos, links back to P1 and on to P3."""
    text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text()
    headers = {"Link": f'<{P3}>; rel="next",<{P1}>; rel="prev"'}
    repos = GogsLister.extract_repos(json.loads(text))
    page_result = GogsListerPage(repos=repos, next_link=P3)
    origin_urls = [repo["clone_url"] for repo in repos]
    return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p3(datadir):
    """Third repository page variant that links onward to a fourth page.

    The ``Link`` header advertises ``P4`` as ``rel="next"``, so the expected
    ``GogsListerPage`` must carry ``next_link=P4`` — the previous value
    (``P3``) contradicted the header this very fixture serves, and would not
    match what the lister parses from the response.
    """
    text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
    headers = {"Link": f'<{P4}>; rel="next",<{P2}>; rel="prev"'}
    page_result = GogsListerPage(
        repos=GogsLister.extract_repos(json.loads(text)), next_link=P4
    )
    origin_urls = [r["clone_url"] for r in page_result.repos]
    return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p4(datadir):
    """Fourth (final) repository page: only a ``rel="prev"`` link.

    The ``Link`` header contains no ``rel="next"`` entry, so the expected
    ``GogsListerPage`` must have ``next_link=None`` — the previous value
    (``P3``) contradicted the header and would mark a non-final page.
    """
    text = Path(datadir, "https_try.gogs.io", "repos_page4").read_text()
    headers = {"Link": f'<{P3}>; rel="prev"'}
    page_result = GogsListerPage(
        repos=GogsLister.extract_repos(json.loads(text)), next_link=None
    )
    origin_urls = [r["clone_url"] for r in page_result.repos]
    return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p3_last(datadir):
    """Third page served as the last page: repos present, no "next" link."""
    text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
    headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'}
    repos = GogsLister.extract_repos(json.loads(text))
    page_result = GogsListerPage(repos=repos, next_link=None)
    origin_urls = [repo["clone_url"] for repo in repos]
    return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p3_empty():
    """Third page served empty: no repos at all, no "next" link."""
    body = {"data": [], "ok": True}
    text = json.dumps(body)
    headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'}
    page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None)
    origins_urls = []
    return text, headers, page_result, origins_urls
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
    """Asserts that the two collections have the same origin URLs.

    Does not test last_update."""
    expected = sorted(lister_urls)
    recorded = sorted(scheduler_origins)
    assert len(expected) == len(recorded)
    for url, origin in zip(expected, recorded):
        assert url == origin.url
def test_gogs_full_listing(
    swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3_last
):
    """Full listing over three mocked pages: checks run stats, the pages
    handed to get_origins_from_page, the origins recorded by the scheduler
    and the final lister state."""
    lister = GogsLister(
        scheduler=swh_scheduler,
        url=TRY_GOGS_URL,
        instance="try_gogs",
        page_size=3,
        api_token="secret",
    )
    spy = mocker.spy(lister, "get_origins_from_page")

    fixtures = [trygogs_p1, trygogs_p2, trygogs_p3_last]
    for url, (text, headers, _, _) in zip((P1, P2, P3), fixtures):
        requests_mock.get(url, text=text, headers=headers)

    stats = lister.run()
    assert stats.pages == 3
    assert stats.origins == 9

    expected_calls = [mocker.call(result) for (_, _, result, _) in fixtures]
    spy.assert_has_calls(expected_calls)

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    all_urls = [url for (_, _, _, urls) in fixtures for url in urls]
    check_listed_origins(all_urls, scheduler_origins)

    # P3 didn't provide any next link so it remains the last_seen_next_link.
    assert lister.get_state_from_scheduler().last_seen_next_link == P3
def test_gogs_auth_instance(
    swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_p3_empty
):
    """Covers token authentication, token from credentials,
    instance inference from URL."""
    api_token = "secret"
    instance = "try_gogs"

    # Without api_token or credentials, construction must fail:
    with pytest.raises(ValueError, match="No credentials or API token provided"):
        GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, instance=instance)

    # With an explicit api_token, the auth header must be set:
    lister = GogsLister(
        scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token=api_token, instance=instance
    )
    assert lister.session.headers["Authorization"].lower() == "token %s" % api_token

    # With credentials only, the token is picked up from them:
    creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
    lister = GogsLister(
        scheduler=swh_scheduler,
        url=TRY_GOGS_URL,
        credentials=creds,
        instance=instance,
        page_size=3,
    )
    assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
    assert lister.instance == "try_gogs"

    # setup requests mocking
    for url, fixture in ((P1, trygogs_p1), (P2, trygogs_p2), (P3, trygogs_p3_empty)):
        text, headers, _, _ = fixture
        requests_mock.get(url, text=text, headers=headers)

    # lister should run without any error and extract the origins
    stats = lister.run()
    assert stats.pages == 3
    assert stats.origins == 6
@pytest.mark.parametrize("http_code", [400, 500, 502])
def test_gogs_list_http_error(
    swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last
):
    """Test handling of some HTTP errors commonly encountered"""
    lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret")

    p1_text, p1_headers, _, p1_origin_urls = trygogs_p1
    p3_text, p3_headers, _, _ = trygogs_p3_last

    # The second response fails, so the run must abort with HTTPError
    # after only the first page has been processed.
    responses = [
        {"text": p1_text, "headers": p1_headers, "status_code": 200},
        {"status_code": http_code},
        {"text": p3_text, "headers": p3_headers, "status_code": 200},
    ]
    requests_mock.get(TRY_GOGS_URL + lister.REPO_LIST_PATH, responses)

    with pytest.raises(HTTPError):
        lister.run()

    # Only the first page is listed.
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    check_listed_origins(p1_origin_urls, scheduler_origins)
def test_gogs_incremental_lister(
    swh_scheduler,
    requests_mock,
    mocker,
    trygogs_p1,
    trygogs_p2,
    trygogs_p3,
    trygogs_p3_last,
    trygogs_p3_empty,
    trygogs_p4,
):
    """Scenario test of incremental listing across four successive runs.

    Run 1: P1+P2 full, P3 empty -> 6 origins, state points at P3.
    Run 2: P3 now has repos (last page) -> only P3 is re-fetched, 3 origins.
    Run 3: nothing new -> P3 revisited, state not updated.
    Run 4: P3 links to a new P4 -> P3 and P4 fetched, state moves to P4.
    """
    kwargs = dict(
        url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret"
    )
    lister = GogsLister(scheduler=swh_scheduler, **kwargs)
    lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page")

    # First listing attempt: P1 and P2 return 3 origins each
    # while P3 (current last page) is empty.
    p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1
    p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2
    p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_empty

    requests_mock.get(P1, text=p1_text, headers=p1_headers)
    requests_mock.get(P2, text=p2_text, headers=p2_headers)
    requests_mock.get(P3, text=p3_text, headers=p3_headers)

    attempt1_stats = lister.run()
    assert attempt1_stats.pages == 3
    assert attempt1_stats.origins == 6

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    lister_state = lister.get_state_from_scheduler()
    # P3 was empty, so it stays the last seen "next" link; last repo id
    # therefore comes from P2.
    assert lister_state.last_seen_next_link == P3
    assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"]
    assert lister.updated

    check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)

    lister.updated = False  # Reset the flag

    # Second listing attempt: P3 isn't empty anymore.
    # The lister should restart from last state and hence revisit P3.
    p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last
    requests_mock.get(P3, text=p3_text, headers=p3_headers)

    lister.session.get = mocker.spy(lister.session, "get")

    attempt2_stats = lister.run()
    assert attempt2_stats.pages == 1
    assert attempt2_stats.origins == 3

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    # The single HTTP request must target the page saved in the state.
    page_id = _parse_page_id(lister_state.last_seen_next_link)
    query_params = lister.query_params
    query_params["page"] = page_id
    lister.session.get.assert_called_once_with(
        TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
    )

    # All the 9 origins (3 pages) should be passed on to the scheduler:
    check_listed_origins(
        p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
    )
    lister_state = lister.get_state_from_scheduler()
    assert lister_state.last_seen_next_link == P3
    assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"]
    assert lister.updated

    lister.updated = False  # Reset the flag

    # Third listing attempt: No new origins
    # The lister should revisit last seen page (P3)
    attempt3_stats = lister.run()
    assert attempt3_stats.pages == 1
    assert attempt3_stats.origins == 3

    lister_state = lister.get_state_from_scheduler()
    assert lister_state.last_seen_next_link == P3
    assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"]
    assert lister.updated is False  # No new origins so state isn't updated.

    # Fourth listing attempt: Page 4 is introduced and returns 3 new origins
    # The lister should revisit last seen page (P3) as well as P4.
    p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3  # new P3 points to P4
    p4_text, p4_headers, p4_result, p4_origin_urls = trygogs_p4

    requests_mock.get(P3, text=p3_text, headers=p3_headers)
    requests_mock.get(P4, text=p4_text, headers=p4_headers)

    attempt4_stats = lister.run()
    assert attempt4_stats.pages == 2
    assert attempt4_stats.origins == 6

    lister_state = lister.get_state_from_scheduler()
    assert lister_state.last_seen_next_link == P4
    assert lister_state.last_seen_repo_id == p4_result.repos[-1]["id"]
    assert lister.updated

    # All the 12 origins (4 pages) should be passed on to the scheduler:
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    check_listed_origins(
        p1_origin_urls + p2_origin_urls + p3_origin_urls + p4_origin_urls,
        scheduler_origins,
    )
Does it really mock the first listing attempt? I would expect the second request (with P2) not to have a Link header, so the lister wouldn't send the third request (with P3).