Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/core/paging_lister.py
- This file was added.
# Copyright (C) 2015-2018 the Software Heritage developers | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import abc | |||||
import logging | |||||
from .lister_transports import SWHListerHttpTransport | |||||
from .lister_base import SWHListerBase | |||||
class PageByPageLister(SWHListerBase):
    """Lister* intermediate class for any service that follows the simple
    pagination page pattern.

    - Client sends a request to list repositories starting from a
      given page identifier.

    - Client receives structured (json/xml/etc) response with
      information about a sequential series of repositories (per page)
      starting from a given index. And, if available, some indication
      of the next page index for fetching the remaining repository
      data.

    See :class:`swh.lister.core.lister_base.SWHListerBase` for more
    details.

    This class cannot be instantiated. To create a new Lister for a
    source code listing service that follows the model described
    above, you must subclass this class. Then provide the required
    overrides in addition to any unmet implementation/override
    requirements of this class's base (see parent class and member
    docstrings for details).

    Required Overrides::

        def get_next_target_from_response
        def get_pages_information

    """
    @abc.abstractmethod
    def get_next_target_from_response(self, response):
        """Find the next server endpoint page given the entire response.

        Implementation of this method depends on the server API spec
        and the shape of the network response object returned by the
        transport_request method.

        For example, some api can use the headers links to provide the
        next page.

        Args:
            response (transport response): response page from the server

        Returns:
            index of next page, possibly extracted from a next href url.
            The run method stops when this is None or equal to the
            page that was just listed.

        """

    @abc.abstractmethod
    def get_pages_information(self):
        """Find the total number of pages.

        Implementation of this method depends on the server API spec
        and the shape of the network response object returned by the
        transport_request method.

        For example, some api can use dedicated headers:
        - x-total-pages to provide the total number of pages
        - x-total to provide the total number of repositories
        - x-per-page to provide the number of elements per page

        Returns:
            tuple (total number of repositories, total number of
            pages, per_page)

        """

    # You probably don't need to override anything below this line.

    def do_additional_checks(self, models_list):
        """Potentially check for existence of repositories in models_list.

        This will be called only if check_existence is flipped on in
        the run method below.

        Args:
            models_list: iterable of mappings, each holding a 'uid' key

        Returns:
            False as soon as ``db_query_equal('uid', ...)`` yields a
            truthy result for one of the models, otherwise models_list
            itself, unchanged.

        """
        # any() short-circuits, so no further query is issued once one
        # model has been found.
        if any(self.db_query_equal('uid', m['uid']) for m in models_list):
            return False
        return models_list

    def run(self, min_bound=None, max_bound=None, check_existence=False):
        """Main entry function. Sequentially fetches repository data from the
        service according to the basic outline in the class
        docstring. Keeps fetching pages until one of the following
        holds:

        - the current page is no longer within [min_bound, max_bound]
          (as decided by is_within_bounds),
        - ingest_data reports that nothing was injected for the page,
        - no next page reference is given, or it points back to the
          page that was just listed.

        The database session is committed and renewed every 20 pages,
        and once more when the loop ends.

        Args:
            min_bound: optional page to start from
            max_bound: optional page to stop at
            check_existence (bool): optional existence check (for
                                    incremental lister whose sort
                                    order is inverted)

        Returns:
            nothing

        """
        page = min_bound or 0
        loop_count = 0

        self.min_page = min_bound
        self.max_page = max_bound

        while self.is_within_bounds(page, self.min_page, self.max_page):
            # Pass the page as a logging argument rather than
            # pre-formatting with '%': formatting is deferred and does
            # not break on tuple-shaped page identifiers.
            logging.info('listing repos starting at %s', page)

            response, injected_repos = self.ingest_data(
                page, checks=check_existence)
            if not injected_repos:
                logging.info('Repositories already seen, stopping')
                break

            next_page = self.get_next_target_from_response(response)

            # termination condition: no next page, or a next page that
            # would make us list the same page again
            if next_page is None or next_page == page:
                logging.info('stopping after page %s, no next link found',
                             page)
                break
            page = next_page

            # Flush periodically so a long listing is not held in a
            # single pending session.
            loop_count += 1
            if loop_count == 20:
                logging.info('flushing updates')
                loop_count = 0
                self.db_session.commit()
                self.db_session = self.mk_session()

        # Final flush for whatever was ingested since the last commit.
        self.db_session.commit()
        self.db_session = self.mk_session()


class PageByPageHttpLister(SWHListerHttpTransport, PageByPageLister):
    """Page-by-page lister bound to the HTTP transport.

    Convenience class for ensuring right lookup and init order when
    combining PageByPageLister and SWHListerHttpTransport.

    """
    def __init__(self, api_baseurl=None, override_config=None):
        """Initialise both parents explicitly, transport first.

        Args:
            api_baseurl: forwarded to SWHListerHttpTransport.__init__
            override_config: forwarded to PageByPageLister.__init__

        """
        # Each parent initialiser is called by name, each with its own
        # keyword argument; the transport is set up before the lister.
        SWHListerHttpTransport.__init__(self, api_baseurl=api_baseurl)
        PageByPageLister.__init__(self, override_config=override_config)