Changeset View
Standalone View
swh/lister/npm/lister.py
- This file was added.
# Copyright (C) 2018 the Software Heritage developers | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
from urllib.parse import quote | |||||
from swh.lister.core.indexing_lister import SWHIndexingHttpLister | |||||
from swh.lister.npm.models import NpmModel | |||||
from swh.scheduler.utils import create_task_dict | |||||
class NpmLister(SWHIndexingHttpLister): | |||||
"""List all packages available in the npm registry in a paginated way. | |||||
""" | |||||
PATH_TEMPLATE = '/_all_docs?startkey="%s"' | |||||
MODEL = NpmModel | |||||
LISTER_NAME = 'npm' | |||||
def __init__(self, api_baseurl='https://replicate.npmjs.com', | |||||
per_page=10000, override_config=None): | |||||
super().__init__(api_baseurl=api_baseurl, | |||||
override_config=override_config) | |||||
self.per_page = per_page + 1 | |||||
ardumont: why the +1? | |||||
Done Inline ActionsFor pagination purpose, see http://docs.couchdb.org/en/stable/ddocs/views/pagination.html#paging-alternate-method anlambert: For pagination purpose, see http://docs.couchdb.org/en/stable/ddocs/views/pagination. | |||||
Not Done Inline Actionslol, ok that's what i kinda did for the content_get_range endpoint [1] [1] https://forge.softwareheritage.org/source/swh-storage/browse/master/swh/storage/storage.py$278 ardumont: lol, ok that's what i kinda did for the content_get_range endpoint [1]
[1] https://forge. | |||||
self.PATH_TEMPLATE += '&limit=%s' % self.per_page | |||||
def get_model_from_repo(self, repo_name): | |||||
"""(Override) Transform from npm package name to model | |||||
""" | |||||
package_url, package_metadata_url = self._compute_urls(repo_name) | |||||
return { | |||||
'uid': repo_name, | |||||
'indexable': repo_name, | |||||
'name': repo_name, | |||||
'full_name': repo_name, | |||||
'html_url': package_metadata_url, | |||||
'origin_url': package_url, | |||||
'origin_type': 'npm', | |||||
'description': None | |||||
} | |||||
def task_dict(self, origin_type, origin_url, **kwargs): | |||||
"""(Override) Return task dict for loading a npm package into the archive | |||||
This is overridden from the lister_base as more information is | |||||
needed for the ingestion task creation. | |||||
""" | |||||
_type = 'origin-update-%s' % origin_type | |||||
_policy = 'recurring' | |||||
package_name = kwargs.get('name') | |||||
package_metadata_url = kwargs.get('html_url') | |||||
return create_task_dict(_type, _policy, package_name, origin_url, | |||||
package_metadata_url=package_metadata_url) | |||||
def get_next_target_from_response(self, response): | |||||
"""(Override) Get next npm package name to continue the listing | |||||
""" | |||||
repos = response.json()['rows'] | |||||
return repos[-1]['id'] if len(repos) == self.per_page else None | |||||
def transport_response_simplified(self, response): | |||||
Not Done Inline ActionsAs per latest discussion on other reviews and irc, it's better to keep only 1 return statement per method. Proposal: return repos[-1]['id'] if len(repos) == self.per_page else None ardumont: As per latest discussion on other reviews and irc, it's better to keep only 1 return statement… | |||||
Done Inline Actionsack anlambert: ack | |||||
"""(Override) Transform npm registry response to list for model manipulation | |||||
""" | |||||
repos = response.json()['rows'] | |||||
if len(repos) == self.per_page: | |||||
repos = repos[:-1] | |||||
return [self.get_model_from_repo(repo['id']) for repo in repos] | |||||
def request_headers(self): | |||||
"""(Override) Set requests headers to send when querying the npm registry | |||||
""" | |||||
return {'User-Agent': 'Software Heritage npm lister', | |||||
'Accept': 'application/json'} | |||||
def _compute_urls(self, repo_name): | |||||
"""Return a tuple (package_url, package_metadata_url) | |||||
""" | |||||
return ( | |||||
'https://www.npmjs.com/package/%s' % repo_name, | |||||
# package metadata url needs to be escaped otherwise some requests | |||||
# may fail (for instance when a package name contains '/') | |||||
'%s/%s' % (self.api_baseurl, quote(repo_name, safe='')) | |||||
) | |||||
def string_pattern_check(self, inner, lower, upper=None): | |||||
""" (Override) Inhibit the effect of that method as packages indices | |||||
correspond to package names and thus do not respect any kind | |||||
of fixed length string pattern | |||||
""" | |||||
pass | |||||
Not Done Inline ActionsIs that why the class does not declare a API_URL_INDEX_RE? ardumont: Is that why the class does not declare a API_URL_INDEX_RE? | |||||
Done Inline ActionsAPI_URL_INDEX_RE is only declared and used in the Github lister. For npm, there is no need to parse an url to get the next index as Nevertheless, I do not like this but I could not find an other solution in order [1] http://docs.couchdb.org/en/stable/ddocs/views/pagination.html#paging-alternate-method anlambert: API_URL_INDEX_RE is only declared and used in the Github lister.
For npm, there is no need to… | |||||
Done Inline ActionsFor the record, this is what I got when running the tests without overriding that method: self = <test_npm_lister.NpmListerTester testMethod=test_fetch_one_nodb>, http_mocker = <requests_mock.mocker.Mocker object at 0x7f73e5f84780> def test_fetch_one_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_storage_and_scheduler(fl) self.disable_db(fl) > fl.run(min_bound=self.first_index, max_bound=self.first_index) swh/lister/core/tests/test_lister.py:191: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ swh/lister/core/indexing_lister.py:176: in run response, injected_repos = self.ingest_data(index) swh/lister/core/lister_base.py:511: in ingest_data models_list = self.filter_before_inject(models_list) swh/lister/core/indexing_lister.py:68: in filter_before_inject m for m in models_list swh/lister/core/indexing_lister.py:69: in <listcomp> if self.is_within_bounds(m['indexable'], None, self.max_index) swh/lister/core/lister_base.py:198: in is_within_bounds self.string_pattern_check(inner, lower, upper) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = <swh.lister.npm.lister.NpmLister object at 0x7f73deeade48>, a = 'jquery-1.8', b = None, c = 'jquery' def string_pattern_check(self, a, b, c=None): """When comparing indexable types in is_within_bounds, complex strings may not be allowed to differ in basic structure. If they do, it could be a sign of not understanding the data well. For instance, an ISO 8601 time string cannot be compared against its urlencoded equivalent, but this is an easy mistake to accidentally make. This method acts as a friendly sanity check. Args: a (string): inner component of the is_within_bounds method b (string): lower component of the is_within_bounds method c (string): upper component of the is_within_bounds method Returns: nothing Raises: TypeError if strings a, b, and c don't conform to the same basic pattern. """ if isinstance(a, str): a_pattern = re.sub('[a-zA-Z0-9]', '[a-zA-Z0-9]', re.escape(a)) if (isinstance(b, str) and (re.match(a_pattern, b) is None) or isinstance(c, str) and (re.match(a_pattern, c) is None)): logging.debug(a_pattern) > raise TypeError('incomparable string patterns detected') E TypeError: incomparable string patterns detected swh/lister/core/lister_base.py:437: TypeError ------------------------------------------------------------------------------------------------------------------------------ Captured log call ------------------------------------------------------------------------------------------------------------------------------ lister_base.py 203 ERROR incomparable string patterns detected: inner=<class 'str'>jquery-1.8, lower=<class 'NoneType'>None, upper=<class 'str'>jquery anlambert: For the record, this is what I got when running the tests without overriding that method:
```… | |||||
Not Done Inline Actions
ok, thanks. No biggie, just wondering ;)
Huh, right. ardumont: > API_URL_INDEX_RE is only declared and used in the Github lister.
ok, thanks. No biggie, just… |
why the +1?