Changeset View
Standalone View
swh/lister/npm/lister.py
# Copyright (C) 2018 the Software Heritage developers | # Copyright (C) 2018 the Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from urllib.parse import quote | from urllib.parse import quote | ||||
from swh.lister.core.indexing_lister import SWHIndexingHttpLister | from swh.lister.core.indexing_lister import SWHIndexingHttpLister | ||||
from swh.lister.npm.models import NpmModel | from swh.lister.npm.models import NpmModel | ||||
from swh.scheduler.utils import create_task_dict | from swh.scheduler.utils import create_task_dict | ||||
class NpmLister(SWHIndexingHttpLister): | class NpmListerBase(SWHIndexingHttpLister): | ||||
"""List all packages available in the npm registry in a paginated way. | """List packages available in the npm registry in a paginated way | ||||
""" | """ | ||||
PATH_TEMPLATE = '/_all_docs?startkey="%s"' | |||||
MODEL = NpmModel | MODEL = NpmModel | ||||
LISTER_NAME = 'npm' | LISTER_NAME = 'npm' | ||||
def __init__(self, api_baseurl='https://replicate.npmjs.com', | def __init__(self, api_baseurl='https://replicate.npmjs.com', | ||||
per_page=10000, override_config=None): | per_page=1000, override_config=None): | ||||
super().__init__(api_baseurl=api_baseurl, | super().__init__(api_baseurl=api_baseurl, | ||||
override_config=override_config) | override_config=override_config) | ||||
self.per_page = per_page + 1 | self.per_page = per_page + 1 | ||||
self.PATH_TEMPLATE += '&limit=%s' % self.per_page | self.PATH_TEMPLATE += '&limit=%s' % self.per_page | ||||
@property | |||||
def ADDITIONAL_CONFIG(self): | |||||
"""(Override) Add extra configuration | |||||
""" | |||||
default_config = super().ADDITIONAL_CONFIG | |||||
default_config['loading_task_policy'] = ('str', 'recurring') | |||||
return default_config | |||||
def get_model_from_repo(self, repo_name): | def get_model_from_repo(self, repo_name): | ||||
"""(Override) Transform from npm package name to model | """(Override) Transform from npm package name to model | ||||
""" | """ | ||||
package_url, package_metadata_url = self._compute_urls(repo_name) | package_url, package_metadata_url = self._compute_urls(repo_name) | ||||
return { | return { | ||||
'uid': repo_name, | 'uid': repo_name, | ||||
'indexable': repo_name, | 'indexable': repo_name, | ||||
'name': repo_name, | 'name': repo_name, | ||||
'full_name': repo_name, | 'full_name': repo_name, | ||||
'html_url': package_metadata_url, | 'html_url': package_metadata_url, | ||||
'origin_url': package_url, | 'origin_url': package_url, | ||||
'origin_type': 'npm', | 'origin_type': 'npm', | ||||
'description': None | 'description': None | ||||
} | } | ||||
def task_dict(self, origin_type, origin_url, **kwargs): | def task_dict(self, origin_type, origin_url, **kwargs): | ||||
"""(Override) Return task dict for loading a npm package into the archive | """(Override) Return task dict for loading a npm package into the archive | ||||
This is overridden from the lister_base as more information is | This is overridden from the lister_base as more information is | ||||
needed for the ingestion task creation. | needed for the ingestion task creation. | ||||
""" | """ | ||||
_type = 'origin-update-%s' % origin_type | task_type = 'origin-update-%s' % origin_type | ||||
_policy = 'recurring' | task_policy = self.config['loading_task_policy'] | ||||
ardumont: Why oneshot?
| |||||
Done Inline Actions In order to only create loading tasks when they are needed. The npm registry is pretty huge (more than 800 000 packages in it), so to avoid having too many recurring tasks, I think the approach "ingest all packages once then ingest only those with updates at the next listing" is better here. anlambert: In order to only create loading tasks when they are needed. The npm registry is pretty huge (more… | |||||
Not Done Inline Actions I'm not completely convinced. It's my understanding that the JavaScript world moves a lot. Won't that create a lot of oneshot tasks anyway? Another question pops up. What's a package? ardumont: I'm not completely convinced.
We are far from the 80M from the git repositories for example ;)… | |||||
Done Inline Actions
Surely, but less than having recurring tasks for all available packages. From my point of view, if we can benefit from a listing
In the npm semantics, a package is a project so you can see it as a group of source code archives (1 per version). anlambert: > It's my understanding that the javascript world moves a lot. Won't that create a lot of… | |||||
Not Done Inline Actions
Now, I'm sold \m/ ardumont: > Surely, but less than having recurring tasks for all available packages. From my point of… | |||||
package_name = kwargs.get('name') | package_name = kwargs.get('name') | ||||
package_metadata_url = kwargs.get('html_url') | package_metadata_url = kwargs.get('html_url') | ||||
return create_task_dict(_type, _policy, package_name, origin_url, | return create_task_dict(task_type, task_policy, | ||||
package_name, origin_url, | |||||
package_metadata_url=package_metadata_url) | package_metadata_url=package_metadata_url) | ||||
def get_next_target_from_response(self, response): | |||||
"""(Override) Get next npm package name to continue the listing | |||||
""" | |||||
repos = response.json()['rows'] | |||||
return repos[-1]['id'] if len(repos) == self.per_page else None | |||||
def transport_response_simplified(self, response): | |||||
"""(Override) Transform npm registry response to list for model manipulation | |||||
""" | |||||
repos = response.json()['rows'] | |||||
if len(repos) == self.per_page: | |||||
repos = repos[:-1] | |||||
return [self.get_model_from_repo(repo['id']) for repo in repos] | |||||
def request_headers(self): | def request_headers(self): | ||||
"""(Override) Set requests headers to send when querying the npm registry | """(Override) Set requests headers to send when querying the npm registry | ||||
""" | """ | ||||
return {'User-Agent': 'Software Heritage npm lister', | return {'User-Agent': 'Software Heritage npm lister', | ||||
'Accept': 'application/json'} | 'Accept': 'application/json'} | ||||
def _compute_urls(self, repo_name): | def _compute_urls(self, repo_name): | ||||
"""Return a tuple (package_url, package_metadata_url) | """Return a tuple (package_url, package_metadata_url) | ||||
""" | """ | ||||
return ( | return ( | ||||
'https://www.npmjs.com/package/%s' % repo_name, | 'https://www.npmjs.com/package/%s' % repo_name, | ||||
# package metadata url needs to be escaped otherwise some requests | # package metadata url needs to be escaped otherwise some requests | ||||
# may fail (for instance when a package name contains '/') | # may fail (for instance when a package name contains '/') | ||||
'%s/%s' % (self.api_baseurl, quote(repo_name, safe='')) | '%s/%s' % (self.api_baseurl, quote(repo_name, safe='')) | ||||
) | ) | ||||
def string_pattern_check(self, inner, lower, upper=None): | def string_pattern_check(self, inner, lower, upper=None): | ||||
""" (Override) Inhibit the effect of that method as packages indices | """ (Override) Inhibit the effect of that method as packages indices | ||||
correspond to package names and thus do not respect any kind | correspond to package names and thus do not respect any kind | ||||
of fixed length string pattern | of fixed length string pattern | ||||
""" | """ | ||||
pass | pass | ||||
class NpmLister(NpmListerBase): | |||||
"""List all packages available in the npm registry in a paginated way | |||||
""" | |||||
PATH_TEMPLATE = '/_all_docs?startkey="%s"' | |||||
def get_next_target_from_response(self, response): | |||||
"""(Override) Get next npm package name to continue the listing | |||||
""" | |||||
repos = response.json()['rows'] | |||||
return repos[-1]['id'] if len(repos) == self.per_page else None | |||||
def transport_response_simplified(self, response): | |||||
"""(Override) Transform npm registry response to list for model manipulation | |||||
""" | |||||
repos = response.json()['rows'] | |||||
if len(repos) == self.per_page: | |||||
repos = repos[:-1] | |||||
return [self.get_model_from_repo(repo['id']) for repo in repos] | |||||
class NpmIncrementalLister(NpmListerBase): | |||||
"""List packages in the npm registry, updated since a specific | |||||
update_seq value of the underlying CouchDB database, in a paginated way | |||||
""" | |||||
PATH_TEMPLATE = '/_changes?since=%s' | |||||
@property | |||||
def CONFIG_BASE_FILENAME(self): # noqa: N802 | |||||
return 'lister-npm-incremental' | |||||
def get_next_target_from_response(self, response): | |||||
"""(Override) Get next npm package name to continue the listing | |||||
""" | |||||
repos = response.json()['results'] | |||||
return repos[-1]['seq'] if len(repos) == self.per_page else None | |||||
def transport_response_simplified(self, response): | |||||
"""(Override) Transform npm registry response to list for model manipulation | |||||
""" | |||||
repos = response.json()['results'] | |||||
if len(repos) == self.per_page: | |||||
repos = repos[:-1] | |||||
return [self.get_model_from_repo(repo['id']) for repo in repos] | |||||
def filter_before_inject(self, models_list): | |||||
"""(Override) Filter out documents in the CouchDB database | |||||
not related to a npm package | |||||
""" | |||||
models_filtered = [] | |||||
for model in models_list: | |||||
package_name = model['name'] | |||||
# document related to CouchDB internals | |||||
if package_name.startswith('_design/'): | |||||
continue | |||||
models_filtered.append(model) | |||||
return models_filtered | |||||
def disable_deleted_repo_tasks(self, start, end, keep_these): | |||||
Not Done Inline Actions I do not see anything in the file past this line; don't you have to add the pass keyword at least? ardumont: I do not see anything in the file past this line; don't you have to add the `pass` keyword… | |||||
Done Inline ActionsGood catch! anlambert: Good catch! | |||||
"""(Override) Disable the processing performed by that method | |||||
as it is not relevant in this incremental lister context | |||||
and it raises an exception due to a different index type | |||||
(int instead of str) | |||||
""" | |||||
pass |
Why oneshot?