Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/npm/tests/test_lister.py
# Copyright (C) 2018-2019 The Software Heritage developers | # Copyright (C) 2018-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | from itertools import chain | ||||
import re | import json | ||||
from typing import Any, List | from pathlib import Path | ||||
import unittest | |||||
import iso8601 | |||||
import requests_mock | import pytest | ||||
from requests.exceptions import HTTPError | |||||
from swh.lister.core.tests.test_lister import HttpListerTesterBase | |||||
from swh.lister.npm.lister import NpmIncrementalLister, NpmLister | from swh.lister import USER_AGENT | ||||
from swh.lister.npm.lister import NpmLister, NpmListerState | |||||
logger = logging.getLogger(__name__) | |||||
@pytest.fixture | |||||
class NpmListerTester(HttpListerTesterBase, unittest.TestCase): | def npm_full_listing_page1(datadir): | ||||
Lister = NpmLister | return json.loads(Path(datadir, "npm_full_page1.json").read_text()) | ||||
test_re = re.compile(r'^.*/_all_docs\?startkey="(.+)".*') | |||||
lister_subdir = "npm" | |||||
good_api_response_file = "data/replicate.npmjs.com/api_response.json" | @pytest.fixture | ||||
bad_api_response_file = "data/api_empty_response.json" | def npm_full_listing_page2(datadir): | ||||
first_index = "jquery" | return json.loads(Path(datadir, "npm_full_page2.json").read_text()) | ||||
entries_per_page = 100 | |||||
@requests_mock.Mocker() | @pytest.fixture | ||||
def test_is_within_bounds(self, http_mocker): | def npm_incremental_listing_page1(datadir): | ||||
# disable this test from HttpListerTesterBase as | return json.loads(Path(datadir, "npm_incremental_page1.json").read_text()) | ||||
# it can not succeed for the npm lister due to the | |||||
# overriding of the string_pattern_check method | |||||
pass | @pytest.fixture | ||||
def npm_incremental_listing_page2(datadir): | |||||
return json.loads(Path(datadir, "npm_incremental_page2.json").read_text()) | |||||
class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase): | |||||
Lister = NpmIncrementalLister | |||||
test_re = re.compile(r"^.*/_changes\?since=([0-9]+).*") | def _check_listed_npm_packages(lister, packages, scheduler_origins): | ||||
lister_subdir = "npm" | for package in packages: | ||||
good_api_response_file = "data/api_inc_response.json" | package_name = package["doc"]["name"] | ||||
bad_api_response_file = "data/api_inc_empty_response.json" | latest_version = package["doc"]["dist-tags"]["latest"] | ||||
first_index = "6920642" | package_last_update = iso8601.parse_date(package["doc"]["time"][latest_version]) | ||||
entries_per_page = 100 | origin_url = lister.PACKAGE_URL_TEMPLATE.format(package_name=package_name) | ||||
@requests_mock.Mocker() | scheduler_origin = [o for o in scheduler_origins if o.url == origin_url] | ||||
def test_is_within_bounds(self, http_mocker): | assert scheduler_origin | ||||
# disable this test from HttpListerTesterBase as | assert scheduler_origin[0].last_update == package_last_update | ||||
# it can not succeed for the npm lister due to the | |||||
# overriding of the string_pattern_check method | |||||
pass | def _match_request(request): | ||||
return request.headers.get("User-Agent") == USER_AGENT | |||||
def check_tasks(tasks: List[Any]): | |||||
"""Ensure scheduled tasks are in the expected format. | def _url_params(page_size, **kwargs): | ||||
params = {"limit": page_size, "include_docs": "true"} | |||||
params.update(**kwargs) | |||||
""" | return params | ||||
for row in tasks: | |||||
logger.debug("row: %s", row) | |||||
assert row["type"] == "load-npm" | def test_npm_lister_full( | ||||
# arguments check | swh_scheduler, requests_mock, mocker, npm_full_listing_page1, npm_full_listing_page2 | ||||
args = row["arguments"]["args"] | ): | ||||
assert len(args) == 0 | """Simulate a full listing of four npm packages in two pages""" | ||||
page_size = 2 | |||||
# kwargs | lister = NpmLister(scheduler=swh_scheduler, page_size=page_size, incremental=False) | ||||
kwargs = row["arguments"]["kwargs"] | |||||
assert len(kwargs) == 1 | requests_mock.get( | ||||
package_url = kwargs["url"] | lister.API_FULL_LISTING_URL, | ||||
package_name = package_url.split("/")[-1] | [{"json": npm_full_listing_page1}, {"json": npm_full_listing_page2},], | ||||
assert package_url == f"https://www.npmjs.com/package/{package_name}" | additional_matcher=_match_request, | ||||
) | |||||
assert row["policy"] == "recurring" | |||||
assert row["priority"] is None | spy_get = mocker.spy(lister.session, "get") | ||||
stats = lister.run() | |||||
def test_npm_lister_basic_listing(lister_npm, requests_mock_datadir): | assert stats.pages == 2 | ||||
lister_npm.run() | assert stats.origins == page_size * stats.pages | ||||
tasks = lister_npm.scheduler.search_tasks(task_type="load-npm") | spy_get.assert_has_calls( | ||||
assert len(tasks) == 100 | [ | ||||
mocker.call( | |||||
check_tasks(tasks) | lister.API_FULL_LISTING_URL, | ||||
params=_url_params(page_size + 1, startkey='""'), | |||||
), | |||||
def test_npm_lister_listing_pagination(lister_npm, requests_mock_datadir): | mocker.call( | ||||
lister = lister_npm | lister.API_FULL_LISTING_URL, | ||||
# Patch per page pagination | params=_url_params( | ||||
lister.per_page = 10 + 1 | page_size + 1, | ||||
lister.PATH_TEMPLATE = lister.PATH_TEMPLATE.replace( | startkey=f'"{npm_full_listing_page1["rows"][-1]["id"]}"', | ||||
"&limit=1001", "&limit=%s" % lister.per_page | ), | ||||
), | |||||
] | |||||
) | |||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins | |||||
_check_listed_npm_packages( | |||||
lister, | |||||
chain(npm_full_listing_page1["rows"][:-1], npm_full_listing_page2["rows"]), | |||||
scheduler_origins, | |||||
) | |||||
assert lister.get_state_from_scheduler() == NpmListerState() | |||||
def test_npm_lister_incremental( | |||||
swh_scheduler, | |||||
requests_mock, | |||||
mocker, | |||||
npm_incremental_listing_page1, | |||||
npm_incremental_listing_page2, | |||||
): | |||||
"""Simulate an incremental listing of four npm packages in two pages""" | |||||
page_size = 2 | |||||
lister = NpmLister(scheduler=swh_scheduler, page_size=page_size, incremental=True) | |||||
requests_mock.get( | |||||
lister.API_INCREMENTAL_LISTING_URL, | |||||
[ | |||||
{"json": npm_incremental_listing_page1}, | |||||
{"json": npm_incremental_listing_page2}, | |||||
{"json": {"results": []}}, | |||||
], | |||||
additional_matcher=_match_request, | |||||
) | |||||
spy_get = mocker.spy(lister.session, "get") | |||||
assert lister.get_state_from_scheduler() == NpmListerState() | |||||
stats = lister.run() | |||||
assert stats.pages == 2 | |||||
assert stats.origins == page_size * stats.pages | |||||
last_seq = npm_incremental_listing_page2["results"][-1]["seq"] | |||||
spy_get.assert_has_calls( | |||||
[ | |||||
mocker.call( | |||||
lister.API_INCREMENTAL_LISTING_URL, | |||||
params=_url_params(page_size, since="0"), | |||||
), | |||||
mocker.call( | |||||
lister.API_INCREMENTAL_LISTING_URL, | |||||
params=_url_params( | |||||
page_size, | |||||
since=str(npm_incremental_listing_page1["results"][-1]["seq"]), | |||||
), | |||||
), | |||||
mocker.call( | |||||
lister.API_INCREMENTAL_LISTING_URL, | |||||
params=_url_params(page_size, since=str(last_seq)), | |||||
), | |||||
] | |||||
) | |||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins | |||||
_check_listed_npm_packages( | |||||
lister, | |||||
chain( | |||||
npm_incremental_listing_page1["results"], | |||||
npm_incremental_listing_page2["results"], | |||||
), | |||||
scheduler_origins, | |||||
) | ) | ||||
assert lister.get_state_from_scheduler() == NpmListerState(last_seq=last_seq) | |||||
def test_npm_lister_incremental_restart( | |||||
swh_scheduler, requests_mock, mocker, | |||||
): | |||||
"""Check incremental npm listing will restart from saved state""" | |||||
page_size = 2 | |||||
last_seq = 67 | |||||
lister = NpmLister(scheduler=swh_scheduler, page_size=page_size, incremental=True) | |||||
lister.state = NpmListerState(last_seq=last_seq) | |||||
requests_mock.get(lister.API_INCREMENTAL_LISTING_URL, json={"results": []}) | |||||
spy_get = mocker.spy(lister.session, "get") | |||||
lister.run() | lister.run() | ||||
tasks = lister.scheduler.search_tasks(task_type="load-npm") | spy_get.assert_called_with( | ||||
assert len(tasks) == 2 * 10 # only 2 files with 10 results each | lister.API_INCREMENTAL_LISTING_URL, | ||||
params=_url_params(page_size, since=str(last_seq)), | |||||
) | |||||
check_tasks(tasks) | def test_npm_lister_http_error( | ||||
swh_scheduler, requests_mock, mocker, | |||||
): | |||||
lister = NpmLister(scheduler=swh_scheduler) | |||||
requests_mock.get(lister.API_FULL_LISTING_URL, status_code=500) | |||||
with pytest.raises(HTTPError): | |||||
lister.run() |