Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/gitlab/tests/test_lister.py
# Copyright (C) 2017-2021 The Software Heritage developers | # Copyright (C) 2017-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import logging | import logging | ||||
from pathlib import Path | from pathlib import Path | ||||
from typing import Dict, List | from typing import Dict, List | ||||
from urllib.parse import quote | |||||
import pytest | import pytest | ||||
from requests.status_codes import codes | from requests.status_codes import codes | ||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||
from swh.lister.gitlab.lister import GitLabLister, _parse_id_after | from swh.lister.gitlab.lister import GitLabLister, _parse_id_after | ||||
from swh.lister.pattern import ListerStats | from swh.lister.pattern import ListerStats | ||||
from swh.lister.tests.test_utils import assert_sleep_calls | from swh.lister.tests.test_utils import assert_sleep_calls | ||||
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines | def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock): | ||||
assert len(scheduler_origins) == expected_nb_origins | assert len(scheduler_origins) == expected_nb_origins | ||||
for listed_origin in scheduler_origins: | for listed_origin in scheduler_origins: | ||||
assert listed_origin.visit_type == "hg" | assert listed_origin.visit_type == "hg" | ||||
assert listed_origin.url.startswith(f"https://{instance}") | assert listed_origin.url.startswith(f"https://{instance}") | ||||
assert listed_origin.last_update is not None | assert listed_origin.last_update is not None | ||||
def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]: | def gitlab_page_response(datadir, instance: str, page_num: int) -> List[Dict]: | ||||
"""Return list of repositories (out of test dataset)""" | """Return list of repositories (out of test dataset)""" | ||||
datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json") | datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_num}.json") | ||||
return json.loads(datapath.read_text()) if datapath.exists else [] | return json.loads(datapath.read_text()) if datapath.exists else [] | ||||
def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir): | def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir): | ||||
"""Gitlab lister supports pagination""" | """Gitlab lister supports pagination""" | ||||
instance = "gite.lirmm.fr" | instance = "gite.lirmm.fr" | ||||
lister = GitLabLister(swh_scheduler, url=api_url(instance)) | lister = GitLabLister(swh_scheduler, url=api_url(instance)) | ||||
Show All 31 Lines | |||||
def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir): | def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir): | ||||
"""Gitlab lister supports incremental visits""" | """Gitlab lister supports incremental visits""" | ||||
instance = "gite.lirmm.fr" | instance = "gite.lirmm.fr" | ||||
url = api_url(instance) | url = api_url(instance) | ||||
lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) | lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) | ||||
url_page1 = lister.page_url() | url_page1 = lister.page_url() | ||||
response1 = gitlab_page_response(datadir, instance, 1) | response1 = gitlab_page_response(datadir, instance, 1) | ||||
url_page2 = lister.page_url(2) | url_page2 = lister.page_url(response1[-1]["id"]) | ||||
response2 = gitlab_page_response(datadir, instance, 2) | response2 = gitlab_page_response(datadir, instance, 2) | ||||
url_page3 = lister.page_url(3) | url_page3 = lister.page_url(response2[-1]["id"]) | ||||
response3 = gitlab_page_response(datadir, instance, 3) | response3 = gitlab_page_response(datadir, instance, 3) | ||||
requests_mock.get( | requests_mock.get( | ||||
url_page1, | url_page1, | ||||
[{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], | [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], | ||||
additional_matcher=_match_request, | additional_matcher=_match_request, | ||||
) | ) | ||||
requests_mock.get( | requests_mock.get( | ||||
url_page2, | url_page2, | ||||
[{"json": response2}], | [{"json": response2}], | ||||
additional_matcher=_match_request, | additional_matcher=_match_request, | ||||
) | ) | ||||
listed_result = lister.run() | listed_result = lister.run() | ||||
expected_nb_origins = len(response1) + len(response2) | expected_nb_origins = len(response1) + len(response2) | ||||
assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) | assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) | ||||
assert lister.state.last_seen_next_link == url_page2 | assert lister.state.last_listing_date == lister.listing_date | ||||
lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) | lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) | ||||
# Lister will start back at the last stop | url_page2 = lister2.page_url() | ||||
response2 = gitlab_page_response(datadir, instance, 2) | |||||
url_page3 = lister2.page_url(response2[-1]["id"]) | |||||
response3 = gitlab_page_response(datadir, instance, 3) | |||||
# in a real-world scenario, the incremental lister will list repositories that | |||||
# have been modified since the last listing date | |||||
last_activity_param = ( | |||||
f"last_activity_after={quote(lister2.state.last_listing_date)}" | |||||
) | |||||
assert last_activity_param in url_page2 | |||||
assert last_activity_param in url_page3 | |||||
requests_mock.get( | requests_mock.get( | ||||
url_page2, | url_page2, | ||||
[{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}], | [{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}], | ||||
additional_matcher=_match_request, | additional_matcher=_match_request, | ||||
) | ) | ||||
requests_mock.get( | requests_mock.get( | ||||
url_page3, | url_page3, | ||||
[{"json": response3}], | [{"json": response3}], | ||||
additional_matcher=_match_request, | additional_matcher=_match_request, | ||||
) | ) | ||||
listed_result2 = lister2.run() | listed_result2 = lister2.run() | ||||
assert listed_result2 == ListerStats( | assert listed_result2 == ListerStats( | ||||
pages=2, origins=len(response2) + len(response3) | pages=2, origins=len(response2) + len(response3) | ||||
) | ) | ||||
assert lister2.state.last_seen_next_link == url_page3 | assert lister2.state.last_listing_date == lister2.listing_date | ||||
assert lister.lister_obj.id == lister2.lister_obj.id | assert lister.lister_obj.id == lister2.lister_obj.id | ||||
scheduler_origins = lister2.scheduler.get_listed_origins( | scheduler_origins = lister2.scheduler.get_listed_origins( | ||||
lister2.lister_obj.id | lister2.lister_obj.id | ||||
).results | ).results | ||||
assert len(scheduler_origins) == len(response1) + len(response2) + len(response3) | assert len(scheduler_origins) == len(response1) + len(response2) + len(response3) | ||||
▲ Show 20 Lines • Show All 163 Lines • Show Last 20 Lines |