# swh/lister/bitbucket/tests/test_lister.py
# Copyright (C) 2017-2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime
import json
import os

import pytest
from requests import Response

from swh.lister.bitbucket.lister import BitbucketLister
def _convert_type(req_index): | @pytest.fixture | ||||
"""Convert the req_index to its right type according to the model's | def bb_api_repositories_page1(datadir): | ||||
"indexable" column. | data_file_path = os.path.join(datadir, "bb_api_repositories_page1.json") | ||||
with open(data_file_path, "r") as data_file: | |||||
return json.load(data_file) | |||||
""" | |||||
return iso8601.parse_date(unquote(req_index)) | |||||
@pytest.fixture | |||||
def bb_api_repositories_page2(datadir): | |||||
data_file_path = os.path.join(datadir, "bb_api_repositories_page2.json") | |||||
with open(data_file_path, "r") as data_file: | |||||
return json.load(data_file) | |||||
class BitBucketListerTester(HttpListerTester, unittest.TestCase): | |||||
Lister = BitBucketLister | |||||
test_re = re.compile(r"/repositories\?after=([^?&]+)") | |||||
lister_subdir = "bitbucket" | |||||
good_api_response_file = "data/https_api.bitbucket.org/response.json" | |||||
bad_api_response_file = "data/https_api.bitbucket.org/empty_response.json" | |||||
first_index = _convert_type("2008-07-12T07:44:01.476818+00:00") | |||||
last_index = _convert_type("2008-07-19T06:16:43.044743+00:00") | |||||
entries_per_page = 10 | |||||
convert_type = _convert_type | |||||
def check_listed_origins(lister_origins, scheduler_origins):
    """Assert that the two collections hold the same origins from the point of
    view of the lister.

    Both collections are sorted, then compared pairwise on ``url`` and
    ``last_update``.
    """
    sorted_lister_origins = list(sorted(lister_origins))
    sorted_scheduler_origins = list(sorted(scheduler_origins))

    assert len(sorted_lister_origins) == len(sorted_scheduler_origins)

    for lo, so in zip(sorted_lister_origins, sorted_scheduler_origins):
        assert lo.url == so.url
        assert lo.last_update == so.last_update
def test_bitbucket_incremental_lister(
    swh_scheduler,
    requests_mock,
    mocker,
    bb_api_repositories_page1,
    bb_api_repositories_page2,
):
    """Simple Bitbucket listing with two pages containing 10 origins each;
    a second run must restart from the stored incremental state."""
    requests_mock.get(
        BitbucketLister.API_URL,
        [{"json": bb_api_repositories_page1}, {"json": bb_api_repositories_page2}],
    )

    lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)

    # First listing
    stats = lister.run()

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins

    assert stats.pages == 2
    assert stats.origins == 20
    assert len(scheduler_origins) == 20
    assert lister.updated

    lister_state = lister.get_state_from_scheduler()
    # Check the attribute exists BEFORE dereferencing it, otherwise the
    # hasattr assertion could never fail (an AttributeError would fire first).
    assert hasattr(lister_state, "last_repo_cdate")
    last_repo_cdate = lister_state.last_repo_cdate.isoformat()
    assert last_repo_cdate == bb_api_repositories_page2["values"][-1]["created_on"]

    # Second listing, restarting from last state
    mock_session = mocker.patch.object(lister, "session", autospec=True)
    mock_session.get.return_value = Response()

    lister.run()

    url_params = lister.url_params
    url_params["after"] = last_repo_cdate
    mock_session.get.assert_called_once_with(lister.API_URL, params=url_params)

    all_origins = (
        bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"]
    )

    check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins)
def test_bitbucket_lister_rate_limit_hit(
    swh_scheduler,
    requests_mock,
    mocker,
    bb_api_repositories_page1,
    bb_api_repositories_page2,
):
    """When the Bitbucket API replies 429, the lister must retry with
    exponentially growing backoff delays and still list every origin."""
    mock_sleep = mocker.patch("swh.lister.bitbucket.lister.time.sleep")

    requests_mock.get(
        BitbucketLister.API_URL,
        [
            {"json": bb_api_repositories_page1, "status_code": 200},
            {"json": None, "status_code": 429},
            {"json": None, "status_code": 429},
            {"json": bb_api_repositories_page2, "status_code": 200},
        ],
    )

    lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)

    stats = lister.run()

    # Two 429 responses -> two sleeps, with exponential backoff delays
    mock_sleep.assert_has_calls(
        [
            mocker.call(lister.BACKOFF_FACTOR),
            mocker.call(lister.BACKOFF_FACTOR * lister.BACKOFF_FACTOR),
        ]
    )

    assert stats.pages == 2
    assert stats.origins == 20
    assert len(swh_scheduler.get_listed_origins(lister.lister_obj.id).origins) == 20
def test_bitbucket_full_lister(
    swh_scheduler,
    requests_mock,
    mocker,
    bb_api_repositories_page1,
    bb_api_repositories_page2,
):
    """A full (non-incremental) run after an incremental one must relist every
    origin without overwriting the stored incremental state."""
    requests_mock.get(
        BitbucketLister.API_URL,
        [
            {"json": bb_api_repositories_page1},
            {"json": bb_api_repositories_page2},
            {"json": bb_api_repositories_page1},
            {"json": bb_api_repositories_page2},
        ],
    )

    lister = BitbucketLister(scheduler=swh_scheduler, page_size=10, incremental=True)

    # First do an incremental run to have an initial lister state
    stats = lister.run()
    last_lister_state = lister.get_state_from_scheduler()
    assert stats.origins == 20

    # Then do the full run and verify lister state did not change.
    # Modify last listed repo modification dates to check they will NOT be
    # saved to the lister state after the full run.
    last_page2_repo = bb_api_repositories_page2["values"][-1]
    last_page2_repo["created_on"] = datetime.now().isoformat()
    last_page2_repo["updated_on"] = datetime.now().isoformat()

    # NOTE(review): assumes set_incremental() is part of BitbucketLister's
    # interface — confirm; a plain `lister.incremental = False` may be intended.
    lister.set_incremental(False)
    stats = lister.run()

    assert stats.origins == 20

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins

    # 20 because scheduler upserts based on (id, type, url)
    assert len(scheduler_origins) == 20

    # A full run must not commit state: modifications on created_on/updated_on
    # of the last repo must not be reflected in the scheduler-stored state.
    assert lister.get_state_from_scheduler() == last_lister_state

    all_origins = (
        bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"]
    )
    check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins)