diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index 89eda25..8560e66 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -1,166 +1,150 @@ # Copyright (C) 2018-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from urllib.parse import quote - from swh.lister.core.indexing_lister import IndexingHttpLister from swh.lister.npm.models import NpmModel from swh.scheduler.utils import create_task_dict class NpmListerBase(IndexingHttpLister): """List packages available in the npm registry in a paginated way """ MODEL = NpmModel LISTER_NAME = 'npm' instance = 'npm' def __init__(self, url='https://replicate.npmjs.com', per_page=1000, override_config=None): super().__init__(url=url, override_config=override_config) self.per_page = per_page + 1 self.PATH_TEMPLATE += '&limit=%s' % self.per_page @property def ADDITIONAL_CONFIG(self): """(Override) Add extra configuration """ default_config = super().ADDITIONAL_CONFIG default_config['loading_task_policy'] = ('str', 'recurring') return default_config def get_model_from_repo(self, repo_name): """(Override) Transform from npm package name to model """ - package_url, package_metadata_url = self._compute_urls(repo_name) + package_url = 'https://www.npmjs.com/package/%s' % repo_name return { 'uid': repo_name, 'indexable': repo_name, 'name': repo_name, 'full_name': repo_name, - 'html_url': package_metadata_url, + 'html_url': package_url, 'origin_url': package_url, 'origin_type': 'npm', } def task_dict(self, origin_type, origin_url, **kwargs): """(Override) Return task dict for loading a npm package into the archive. This is overridden from the lister_base as more information is needed for the ingestion task creation. """ task_type = 'load-%s' % origin_type task_policy = self.config['loading_task_policy'] - package_name = kwargs.get('name') - package_metadata_url = kwargs.get('html_url') return create_task_dict(task_type, task_policy, - package_name=package_name, - package_url=origin_url, - package_metadata_url=package_metadata_url) + url=origin_url) def request_headers(self): """(Override) Set requests headers to send when querying the npm registry. """ headers = super().request_headers() headers['Accept'] = 'application/json' return headers - def _compute_urls(self, repo_name): - """Return a tuple (package_url, package_metadata_url) - """ - return ( - 'https://www.npmjs.com/package/%s' % repo_name, - # package metadata url needs to be escaped otherwise some requests - # may fail (for instance when a package name contains '/') - '%s/%s' % (self.url, quote(repo_name, safe='')) - ) - def string_pattern_check(self, inner, lower, upper=None): """ (Override) Inhibit the effect of that method as packages indices correspond to package names and thus do not respect any kind of fixed length string pattern """ pass class NpmLister(NpmListerBase): """List all packages available in the npm registry in a paginated way """ PATH_TEMPLATE = '/_all_docs?startkey="%s"' def get_next_target_from_response(self, response): """(Override) Get next npm package name to continue the listing """ repos = response.json()['rows'] return repos[-1]['id'] if len(repos) == self.per_page else None def transport_response_simplified(self, response): """(Override) Transform npm registry response to list for model manipulation """ repos = response.json()['rows'] if len(repos) == self.per_page: repos = repos[:-1] return [self.get_model_from_repo(repo['id']) for repo in repos] class NpmIncrementalLister(NpmListerBase): """List packages in the npm registry, updated since a specific update_seq value of the underlying CouchDB database, in a paginated way. """ PATH_TEMPLATE = '/_changes?since=%s' @property def CONFIG_BASE_FILENAME(self): # noqa: N802 return 'lister_npm_incremental' def get_next_target_from_response(self, response): """(Override) Get next npm package name to continue the listing. """ repos = response.json()['results'] return repos[-1]['seq'] if len(repos) == self.per_page else None def transport_response_simplified(self, response): """(Override) Transform npm registry response to list for model manipulation. """ repos = response.json()['results'] if len(repos) == self.per_page: repos = repos[:-1] return [self.get_model_from_repo(repo['id']) for repo in repos] def filter_before_inject(self, models_list): """(Override) Filter out documents in the CouchDB database not related to a npm package. """ models_filtered = [] for model in models_list: package_name = model['name'] # document related to CouchDB internals if package_name.startswith('_design/'): continue models_filtered.append(model) return models_filtered def disable_deleted_repo_tasks(self, start, end, keep_these): """(Override) Disable the processing performed by that method as it is not relevant in this incremental lister context. It also raises an exception due to a different index type (int instead of str). """ pass diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py index 9888795..2a7ed8d 100644 --- a/swh/lister/npm/tests/test_lister.py +++ b/swh/lister/npm/tests/test_lister.py @@ -1,100 +1,98 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import re import requests_mock import unittest from swh.lister.core.tests.test_lister import HttpListerTesterBase from swh.lister.npm.lister import NpmLister, NpmIncrementalLister from typing import Any, List logger = logging.getLogger(__name__) class NpmListerTester(HttpListerTesterBase, unittest.TestCase): Lister = NpmLister test_re = re.compile(r'^.*/_all_docs\?startkey="(.+)".*') lister_subdir = 'npm' good_api_response_file = 'data/replicate.npmjs.com/api_response.json' bad_api_response_file = 'data/api_empty_response.json' first_index = 'jquery' entries_per_page = 100 @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): # disable this test from HttpListerTesterBase as # it can not succeed for the npm lister due to the # overriding of the string_pattern_check method pass class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase): Lister = NpmIncrementalLister test_re = re.compile(r'^.*/_changes\?since=([0-9]+).*') lister_subdir = 'npm' good_api_response_file = 'data/api_inc_response.json' bad_api_response_file = 'data/api_inc_empty_response.json' first_index = '6920642' entries_per_page = 100 @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): # disable this test from HttpListerTesterBase as # it can not succeed for the npm lister due to the # overriding of the string_pattern_check method pass def check_tasks(tasks: List[Any]): """Ensure scheduled tasks are in the expected format. """ for row in tasks: logger.debug('row: %s', row) assert row['type'] == 'load-npm' # arguments check args = row['arguments']['args'] assert len(args) == 0 # kwargs kwargs = row['arguments']['kwargs'] - assert len(kwargs) == 3 - package_name = kwargs['package_name'] - package_url = kwargs['package_url'] - assert package_url == 'https://www.npmjs.com/package/%s' % package_name - meta_url = kwargs['package_metadata_url'] - assert meta_url == 'https://replicate.npmjs.com/%s' % package_name + assert len(kwargs) == 1 + package_url = kwargs['url'] + package_name = package_url.split('/')[-1] + assert package_url == f'https://www.npmjs.com/package/{package_name}' assert row['policy'] == 'recurring' assert row['priority'] is None def test_lister_npm_basic_listing(lister_npm, requests_mock_datadir): lister_npm.run() tasks = lister_npm.scheduler.search_tasks(task_type='load-npm') assert len(tasks) == 100 check_tasks(tasks) def test_lister_npm_listing_pagination(lister_npm, requests_mock_datadir): lister = lister_npm # Patch per page pagination lister.per_page = 10 + 1 lister.PATH_TEMPLATE = lister.PATH_TEMPLATE.replace( '&limit=1001', '&limit=%s' % lister.per_page) lister.run() tasks = lister.scheduler.search_tasks(task_type='load-npm') assert len(tasks) == 2 * 10 # only 2 files with 10 results each check_tasks(tasks)