Changeset View
Standalone View
swh/lister/npm/lister.py
# Copyright (C) 2018-2019 the Software Heritage developers | # Copyright (C) 2018-2019 the Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from urllib.parse import quote | |||||
from swh.lister.core.indexing_lister import IndexingHttpLister | from swh.lister.core.indexing_lister import IndexingHttpLister | ||||
from swh.lister.npm.models import NpmModel | from swh.lister.npm.models import NpmModel | ||||
from swh.scheduler.utils import create_task_dict | from swh.scheduler.utils import create_task_dict | ||||
class NpmListerBase(IndexingHttpLister): | class NpmListerBase(IndexingHttpLister): | ||||
"""List packages available in the npm registry in a paginated way | """List packages available in the npm registry in a paginated way | ||||
Show All 16 Lines | def ADDITIONAL_CONFIG(self): | ||||
default_config = super().ADDITIONAL_CONFIG | default_config = super().ADDITIONAL_CONFIG | ||||
default_config['loading_task_policy'] = ('str', 'recurring') | default_config['loading_task_policy'] = ('str', 'recurring') | ||||
return default_config | return default_config | ||||
def get_model_from_repo(self, repo_name): | def get_model_from_repo(self, repo_name): | ||||
"""(Override) Transform from npm package name to model | """(Override) Transform from npm package name to model | ||||
""" | """ | ||||
package_url, package_metadata_url = self._compute_urls(repo_name) | package_url = 'https://www.npmjs.com/package/%s' % repo_name | ||||
return { | return { | ||||
'uid': repo_name, | 'uid': repo_name, | ||||
'indexable': repo_name, | 'indexable': repo_name, | ||||
'name': repo_name, | 'name': repo_name, | ||||
'full_name': repo_name, | 'full_name': repo_name, | ||||
'html_url': package_metadata_url, | 'html_url': package_url, | ||||
'origin_url': package_url, | 'origin_url': package_url, | ||||
'origin_type': 'npm', | 'origin_type': 'npm', | ||||
} | } | ||||
def task_dict(self, origin_type, origin_url, **kwargs): | def task_dict(self, origin_type, origin_url, **kwargs): | ||||
"""(Override) Return task dict for loading a npm package into the | """(Override) Return task dict for loading a npm package into the | ||||
archive. | archive. | ||||
This is overridden from the lister_base as more information is | This is overridden from the lister_base as more information is | ||||
needed for the ingestion task creation. | needed for the ingestion task creation. | ||||
""" | """ | ||||
task_type = 'load-%s' % origin_type | task_type = 'load-%s' % origin_type | ||||
task_policy = self.config['loading_task_policy'] | task_policy = self.config['loading_task_policy'] | ||||
package_name = kwargs.get('name') | |||||
package_metadata_url = kwargs.get('html_url') | |||||
return create_task_dict(task_type, task_policy, | return create_task_dict(task_type, task_policy, | ||||
package_name=package_name, | url=origin_url) | ||||
[inline review comment] anlambert: trailing comma here | |||||
package_url=origin_url, | |||||
package_metadata_url=package_metadata_url) | |||||
def request_headers(self): | def request_headers(self): | ||||
"""(Override) Set requests headers to send when querying the npm | """(Override) Set requests headers to send when querying the npm | ||||
registry. | registry. | ||||
""" | """ | ||||
headers = super().request_headers() | headers = super().request_headers() | ||||
headers['Accept'] = 'application/json' | headers['Accept'] = 'application/json' | ||||
return headers | return headers | ||||
def _compute_urls(self, repo_name): | |||||
"""Return a tuple (package_url, package_metadata_url) | |||||
""" | |||||
return ( | |||||
'https://www.npmjs.com/package/%s' % repo_name, | |||||
# package metadata url needs to be escaped otherwise some requests | |||||
# may fail (for instance when a package name contains '/') | |||||
'%s/%s' % (self.url, quote(repo_name, safe='')) | |||||
) | |||||
def string_pattern_check(self, inner, lower, upper=None): | def string_pattern_check(self, inner, lower, upper=None): | ||||
""" (Override) Inhibit the effect of that method as packages indices | """ (Override) Inhibit the effect of that method as packages indices | ||||
correspond to package names and thus do not respect any kind | correspond to package names and thus do not respect any kind | ||||
of fixed length string pattern | of fixed length string pattern | ||||
""" | """ | ||||
pass | pass | ||||
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines |
trailing comma here