diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py new file mode 100644 --- /dev/null +++ b/docs/new_lister_template.py @@ -0,0 +1,166 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import asdict, dataclass +import logging +from typing import Any, Dict, Iterator, List +from urllib.parse import urljoin + +import requests +from tenacity.before_sleep import before_sleep_log + +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. import USER_AGENT +from ..pattern import CredentialsType, Lister + +logger = logging.getLogger(__name__) + +# Aliasing the page results returned by `get_pages` method from the lister. +NewForgeListerPage = List[Dict[str, Any]] + + +@dataclass +class NewForgeListerState: + """The NewForgeLister instance state. This is used for incremental listing. + + """ + + current: str = "" + """Id of the last origin listed on an incremental pass""" + + +# If there is no need to keep state, subclass StatelessLister[NewForgeListerPage] +class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]): + """List origins from the "NewForge" forge. + + """ + + # Part of the lister API, that identifies this lister + LISTER_NAME = "" + # (Optional) CVS type of the origins listed by this lister, if constant + VISIT_TYPE = "" + + # Instance URLs include the hostname and the common path prefix of processed URLs + EXAMPLE_BASE_URL = "https://netloc/api/v1/" + # Path of a specific resource to process, to join the base URL with + EXAMPLE_PATH = "origins/list" + + def __init__( + self, + # Required + scheduler: SchedulerInterface, + # Instance URL, required for multi-instances listers (e.g gitlab, ...) 
+ url: str, + # Instance name (free form) required for multi-instance listers, + # or computed from `url` + instance: str, + # Required whether lister supports authentication or not + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, credentials=credentials, url=url, instance=instance, + ) + + self.session = requests.Session() + # Declare the USER_AGENT is more sysadm-friendly for the forge we list + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + + def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState: + return NewForgeListerState(**d) + + def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]: + return asdict(state) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url, params) -> requests.Response: + # Do the network resource request under a retrying decorator + # to handle rate limiting and transient errors up to a limit. + # `throttling_retry` by default use the `requests` library to check + # only for rate-limit and a base-10 exponential waiting strategy. + # This can be customized by passed waiting, retrying and logging strategies + # as functions. See the `tenacity` library documentation. + + # Log listed URL to ease debugging + logger.debug("Fetching URL %s with params %s", url, params) + response = self.session.get(url, params=params) + + if response.status_code != 200: + # Log response content to ease debugging + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + # The lister must fail on blocking errors + response.raise_for_status() + + return response + + def get_pages(self) -> Iterator[NewForgeListerPage]: + # The algorithm depends on the service, but should request data reliably, + # following pagination if relevant and yielding pages in a streaming fashion. 
+ # If incremental listing is supported, initialize from saved lister state. + # Make use of any next page URL provided. + # Simplify the results early to ease testing and debugging. + + # Initialize from the lister saved state + current = "" + if self.state.current is not None: + current = self.state.current + + # Construct the URL of a service endpoint, the lister can have others to fetch + url = urljoin(self.url, self.EXAMPLE_PATH) + + while current is not None: + # Parametrize the request for incremental listing + body = self.page_request(url, {"current": current}).json() + + # Simplify the page if possible to only the necessary elements + # and yield it + yield body + + # Get the next page parameter or end the loop when there is none + current = body.get("next") + + def get_origins_from_page(self, page: NewForgeListerPage) -> Iterator[ListedOrigin]: + """Convert a page of NewForgeLister repositories into a list of ListedOrigins""" + assert self.lister_obj.id is not None + + for element in page: + + yield ListedOrigin( + # Required. Should use this value. + lister_id=self.lister_obj.id, + # Required. Visit type of the currently processed origin + visit_type=self.VISIT_TYPE, + # Required. URL corresponding to the origin for loaders to ingest + url=..., + # Should get it if the service provides it and if it induces no + # substantial additional processing cost + last_update=..., + ) + + def commit_page(self, page: NewForgeListerPage) -> None: + # Update the lister state to the latest `current` + current = page[-1]["current"] + + if current > self.state.current: + self.state.current = current + + def finalize(self) -> None: + # Pull fresh lister state from the scheduler backend, in case multiple + # listers run concurrently + scheduler_state = self.get_state_from_scheduler() + + # Update the lister state in the backend only if `current` is fresher than + # the one stored in the database. 
+ if self.state.current > scheduler_state.current: + self.updated = True diff --git a/docs/tutorial.rst b/docs/tutorial-2017.rst copy from docs/tutorial.rst copy to docs/tutorial-2017.rst --- a/docs/tutorial.rst +++ b/docs/tutorial-2017.rst @@ -1,4 +1,4 @@ -.. _lister-tutorial: +.. _lister-tutorial-2017: Tutorial: list the content of your favorite forge in just a few steps ===================================================================== @@ -79,7 +79,8 @@ .. figure:: images/new_bitbucket_lister.png -And now this is common shared code in a few abstract base classes, with some new features and loads of docstring comments (in red): +And now this is common shared code in a few abstract base classes, with some new +features and loads of docstring comments (in red): .. figure:: images/new_base.png @@ -215,41 +216,41 @@ from swh.lister.github.models import GitHubModel class GitHubLister(IndexingHttpLister): - PATH_TEMPLATE = '/repositories?since=%d' - MODEL = GitHubModel - - def get_model_from_repo(self, repo): - return {'uid': repo['id'], - 'indexable': repo['id'], - 'name': repo['name'], - 'full_name': repo['full_name'], - 'html_url': repo['html_url'], - 'origin_url': repo['html_url'], - 'origin_type': 'git', - 'description': repo['description']} - - def get_next_target_from_response(self, response): - if 'next' in response.links: - next_url = response.links['next']['url'] - return int(next_url.split('since=')[1]) - else: - return None - - def transport_response_simplified(self, response): - repos = response.json() - return [self.get_model_from_repo(repo) for repo in repos] - - def request_headers(self): - return {'Accept': 'application/vnd.github.v3+json'} - - def transport_quota_check(self, response): - remain = int(response.headers['X-RateLimit-Remaining']) - if response.status_code == 403 and remain == 0: - reset_at = int(response.headers['X-RateLimit-Reset']) - delay = min(reset_at - time.time(), 3600) - return True, delay - else: - return False, 0 + 
PATH_TEMPLATE = '/repositories?since=%d' + MODEL = GitHubModel + + def get_model_from_repo(self, repo): + return {'uid': repo['id'], + 'indexable': repo['id'], + 'name': repo['name'], + 'full_name': repo['full_name'], + 'html_url': repo['html_url'], + 'origin_url': repo['html_url'], + 'origin_type': 'git', + 'description': repo['description']} + + def get_next_target_from_response(self, response): + if 'next' in response.links: + next_url = response.links['next']['url'] + return int(next_url.split('since=')[1]) + else: + return None + + def transport_response_simplified(self, response): + repos = response.json() + return [self.get_model_from_repo(repo) for repo in repos] + + def request_headers(self): + return {'Accept': 'application/vnd.github.v3+json'} + + def transport_quota_check(self, response): + remain = int(response.headers['X-RateLimit-Remaining']) + if response.status_code == 403 and remain == 0: + reset_at = int(response.headers['X-RateLimit-Reset']) + delay = min(reset_at - time.time(), 3600) + return True, delay + else: + return False, 0 We can see that there are some common elements: @@ -300,7 +301,7 @@ # main task ghl = GitHubLister(lister_name='github.com', - api_baseurl='https://github.com') + api_baseurl='https://github.com') ghl.run() ⇓ (IndexingLister.run):: @@ -309,8 +310,8 @@ identifier = None do - response, repos = ListerBase.ingest_data(identifier) - identifier = GitHubLister.get_next_target_from_response(response) + response, repos = ListerBase.ingest_data(identifier) + identifier = GitHubLister.get_next_target_from_response(response) while(identifier) ⇓ (ListerBase.ingest_data):: @@ -327,10 +328,10 @@ # ListerBase.safely_issue_request repeat: - resp = ListerHttpTransport.transport_request(identifier) - retry, delay = ListerHttpTransport.transport_quota_check(resp) - if retry: - sleep(delay) + resp = ListerHttpTransport.transport_request(identifier) + retry, delay = ListerHttpTransport.transport_quota_check(resp) + if retry: + sleep(delay) 
until((not retry) or too_many_retries) return resp @@ -339,7 +340,7 @@ # ListerHttpTransport.transport_request path = ListerBase.api_baseurl - + ListerHttpTransport.PATH_TEMPLATE % identifier + + ListerHttpTransport.PATH_TEMPLATE % identifier headers = ListerHttpTransport.request_headers() return http.get(path, headers) diff --git a/docs/tutorial.rst b/docs/tutorial.rst --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -3,363 +3,363 @@ Tutorial: list the content of your favorite forge in just a few steps ===================================================================== -(the `original version -`_ -of this article appeared on the Software Heritage blog) - -Back in November 2016, Nicolas Dandrimont wrote about structural code changes -`leading to a massive (+15 million!) upswing in the number of repositories -archived by Software Heritage -`_ -through a combination of automatic linkage between the listing and loading -scheduler, new understanding of how to deal with extremely large repository -hosts like `GitHub `_, and activating a new set of -repositories that had previously been skipped over. - -In the post, Nicolas outlined the three major phases of work in Software -Heritage's preservation process (listing, scheduling updates, loading) and -highlighted that the ability to preserve the world's free software heritage -depends on our ability to find and list the repositories. - -At the time, Software Heritage was only able to list projects on -GitHub. Focusing early on GitHub, one of the largest and most active forge in -the world, allowed for a big value-to-effort ratio and a rapid launch for the -archive. As the old Italian proverb goes, "Il meglio è nemico del bene," or in -modern English parlance, "Perfect is the enemy of good," right? Right. So the -plan from the beginning was to implement a lister for GitHub, then maybe -implement another one, and then take a few giant steps backward and squint our -eyes. - -Why? 
Because source code hosting services don't behave according to a unified -standard. Each new service requires dedicated development time to implement a -new scraping client for the non-transferable requirements and intricacies of -that service's API. At the time, doing it in an extensible and adaptable way -required a level of exposure to the myriad differences between these services -that we just didn't think we had yet. - -Nicolas' post closed by saying "We haven't carved out a stable API yet that -allows you to just fill in the blanks, as we only have the GitHub lister -currently, and a proven API will emerge organically only once we have some -diversity." - -That has since changed. As of March 6, 2017, the Software Heritage **lister -code has been aggressively restructured, abstracted, and commented** to make -creating new listers significantly easier. There may yet be a few kinks to iron -out, but **now making a new lister is practically like filling in the blanks**. +Overview +-------- + +The three major phases of work in Software Heritage's preservation process, on the +technical side, are *listing software sources*, *scheduling updates* and *loading the +software artifacts into the archive*. + +A previous effort in 2017 consisted in designing the framework to make lister a +straightforward "fill in the blanks" process, based on gained experience on the +diversity found in the listed services. This is the second iteration on the lister +framework design, comprising a library and an API which is easier to work with and less +"magic" (read implicit). This new design is part of a larger effort in redesigning the +scheduling system for the recurring tasks updating the content of the archive. + +.. _fundamentals: + +Fundamentals +------------ Fundamentally, a basic lister must follow these steps: 1. Issue a network request for a service endpoint. -2. Convert the response into a canonical format. -3. 
Populate a work queue for fetching and ingesting source repositories. - -Steps 1 and 3 are generic problems, so they can get generic solutions hidden -away in the base code, most of which never needs to change. That leaves us to -implement step 2, which can be trivially done now for services with a clean web -APIs. - -In the new code, we've tried to hide away as much generic functionality as -possible, turning it into set-and-forget plumbing between a few simple -customized elements. Different hosting services might use different network -protocols, rate-limit messages, or pagination schemes, but, as long as there is -some way to get a list of the hosted repositories, we think that the new base -code will make getting those repositories much easier. - -First, let me give you the 30,000 foot view… - -The old GitHub-specific lister code looked like this (265 lines of Python): - -.. figure:: images/old_github_lister.png - -By contrast, the new GitHub-specific code looks like this (34 lines of Python): - -.. figure:: images/new_github_lister.png - -And the new BitBucket-specific code is even shorter and looks like this (24 lines of Python): - -.. figure:: images/new_bitbucket_lister.png - -And now this is common shared code in a few abstract base classes, with some new features and loads of docstring comments (in red): - -.. figure:: images/new_base.png - -So how does the lister code work now, and **how might a contributing developer -go about making a new one** - -The first thing to know is that we now have a generic lister base class and ORM -model. A subclass of the lister base should already be able to do almost -everything needed to complete a listing task for a single service -request/response cycle with the following implementation requirements: - -1. A member variable must be declared called ``MODEL``, which is equal to a - subclass (Note: type, not instance) of the base ORM model. 
The reasons for - using a subclass is mostly just because different services use different - incompatible primary identifiers for their repositories. The model - subclasses are typically only one or two additional variable declarations. - -2. A method called ``transport_request`` must be implemented, which takes the - complete target identifier (e.g., a URL) and tries to request it one time - using whatever transport protocol is required for interacting with the - service. It should not attempt to retry on timeouts or do anything else with - the response (that is already done for you). It should just either return - the response or raise a ``FetchError`` exception. - -3. A method called ``transport_response_to_string`` must be implemented, which - takes the entire response of the request in (1) and converts it to a string - for logging purposes. - -4. A method called ``transport_quota_check`` must be implemented, which takes - the entire response of the request in (1) and checks to see if the process - has run afoul of any query quotas or rate limits. If the service says to - wait before making more requests, the method should return ``True`` and also - the number of seconds to wait, otherwise it returns ``False``. - -5. A method called ``transport_response_simplified`` must be implemented, which - also takes the entire response of the request in (1) and converts it to a - Python list of dicts (one dict for each repository) with keys given - according to the aforementioned ``MODEL`` class members. - -Because 1, 2, 3, and 4 are basically dependent only on the chosen network -protocol, we also have an HTTP mix-in module, which supplements the lister base -and provides default implementations for those methods along with optional -request header injection using the Python Requests library. 
The -``transport_quota_check`` method as provided follows the IETF standard for -communicating rate limits with `HTTP code 429 -`_ which some hosting services -have chosen not to follow, so it's possible that a specific lister will need to -override it. - -On top of all of that, we also provide another layer over the base lister class -which adds support for sequentially looping over indices. What are indices? -Well, some services (`BitBucket `_ and GitHub for -example) don't send you the entire list of all of their repositories at once, -because that server response would be unwieldy. Instead they paginate their -results, and they also allow you to query their APIs like this: -``https://server_address.tld/query_type?start_listing_from_id=foo``. Changing -the value of 'foo' lets you fetch a set of repositories starting from there. We -call 'foo' an index, and we call a service that works this way an indexing -service. GitHub uses the repository unique identifier and BitBucket uses the -repository creation time, but a service can really use anything as long as the -values monotonically increase with new repositories. A good indexing service -also includes the URL of the next page with a later 'foo' in its responses. For -these indexing services we provide another intermediate lister called the -indexing lister. Instead of inheriting from :class:`ListerBase -`, the lister class would inherit -from :class:`IndexingLister -`. Along with the -requirements of the lister base, the indexing lister base adds one extra -requirement: - -1. A method called ``get_next_target_from_response`` must be defined, which - takes a complete request response and returns the index ('foo' above) of the - next page. - -So those are all the basic requirements. There are, of course, a few other -little bits and pieces (covered for now in the code's docstring comments), but -for the most part that's it. 
It sounds like a lot of information to absorb and -implement, but remember that most of the implementation requirements mentioned -above are already provided for 99% of services by the HTTP mix-in module. It -looks much simpler when we look at the actual implementations of the two -new-style indexing listers we currently have… +2. Convert the response data into a model object. +3. Send the model object to the scheduler. -When developing a new lister, it's important to test. For this, add the tests -(check `swh/lister/*/tests/`) and register the celery tasks in the main -conftest.py (`swh/lister/core/tests/conftest.py`). +Steps 1 and 3 are generic problems, that are often already solved by helpers or in other +listers. That leaves us mainly to implement step 2, which is simple when the remote +service provides an API. + +.. _prerequisites: + +Prerequisites +------------- + +Skills: + +* object-oriented Python +* requesting remote services through HTTP +* scrapping if no API is offered + +Analysis of the target service. Prepare the following elements to write the lister: + +* instance names and URLs +* requesting scheme: base URL, path, query_string, POST data, headers +* authentication types and which one to support, if any +* rate-limiting: HTTP codes and headers used +* data format: JSON/XML/HTML/...? +* mapping between remote data and needed data (ListedOrigin model, internal state) + +We will now walk through the steps to build a new lister. +Please use this template to start with: :download:`new_lister_template.py` + +.. _lister-declaration: + +Lister declaration +------------------ + +In order to write a lister, two basic elements are required. These are the +:py:class:`Lister` base class and the :py:class:`ListedOrigin` scheduler model class. +Optionally, for listers that need to keep a state and support incremental listing, an +additional object :py:class:`ListerState` will come into play. 
+ +Each lister must subclass :py:class:`Lister ` either directly +or through a subclass such as :py:class:`StatelessLister +` for stateless ones. + +We extensively type-annotate our listers, as any new code, which makes proeminent that +those lister classes are generic, and take the following parameters: + +* :py:class:`Lister`: the lister state type, the page type +* :py:class:`StatelessLister`: only the page type -Another important step is to actually run it within the -docker-dev (:ref:`run-lister-tutorial`). - -This is the entire source code for the BitBucket repository lister:: - - # Copyright (C) 2017 the Software Heritage developers - # License: GNU General Public License version 3 or later - # See top-level LICENSE file for more information - - from urllib import parse - from swh.lister.bitbucket.models import BitBucketModel - from swh.lister.core.indexing_lister import IndexingHttpLister - - class BitBucketLister(IndexingHttpLister): - PATH_TEMPLATE = '/repositories?after=%s' - MODEL = BitBucketModel - - def get_model_from_repo(self, repo): - return {'uid': repo['uuid'], - 'indexable': repo['created_on'], - 'name': repo['name'], - 'full_name': repo['full_name'], - 'html_url': repo['links']['html']['href'], - 'origin_url': repo['links']['clone'][0]['href'], - 'origin_type': repo['scm'], - 'description': repo['description']} - - def get_next_target_from_response(self, response): - body = response.json() - if 'next' in body: - return parse.unquote(body['next'].split('after=')[1]) - else: - return None - - def transport_response_simplified(self, response): - repos = response.json()['values'] - return [self.get_model_from_repo(repo) for repo in repos] - -And this is the entire source code for the GitHub repository lister:: - - # Copyright (C) 2017 the Software Heritage developers - # License: GNU General Public License version 3 or later - # See top-level LICENSE file for more information - - import time - from swh.lister.core.indexing_lister import 
IndexingHttpLister - from swh.lister.github.models import GitHubModel - - class GitHubLister(IndexingHttpLister): - PATH_TEMPLATE = '/repositories?since=%d' - MODEL = GitHubModel - - def get_model_from_repo(self, repo): - return {'uid': repo['id'], - 'indexable': repo['id'], - 'name': repo['name'], - 'full_name': repo['full_name'], - 'html_url': repo['html_url'], - 'origin_url': repo['html_url'], - 'origin_type': 'git', - 'description': repo['description']} - - def get_next_target_from_response(self, response): - if 'next' in response.links: - next_url = response.links['next']['url'] - return int(next_url.split('since=')[1]) - else: - return None - - def transport_response_simplified(self, response): - repos = response.json() - return [self.get_model_from_repo(repo) for repo in repos] - - def request_headers(self): - return {'Accept': 'application/vnd.github.v3+json'} - - def transport_quota_check(self, response): - remain = int(response.headers['X-RateLimit-Remaining']) - if response.status_code == 403 and remain == 0: - reset_at = int(response.headers['X-RateLimit-Reset']) - delay = min(reset_at - time.time(), 3600) - return True, delay - else: - return False, 0 +You can can start by declaring a stateless lister and leave the implementation of state +for later if the listing needs it. We will see how to in :ref:`handling-lister-state`. -We can see that there are some common elements: +Both the lister state type and the page type are user-defined types. However, while the +page type may only exist as a type annotation, the state type for a stateful lister must +be associated with a concrete object. The state type is commonly defined as a dataclass +whereas the page type is often a mere annotation, potentially given a nice alias. -* Both use the HTTP transport mixin (:class:`IndexingHttpLister - `) just combines - :class:`ListerHttpTransport - ` and - :class:`IndexingLister - `) to get most of the - network request functionality for free. 
+Example lister declaration:: -* Both also define ``MODEL`` and ``PATH_TEMPLATE`` variables. It should be - clear to developers that ``PATH_TEMPLATE``, when combined with the base - service URL (e.g., ``https://some_service.com``) and passed a value (the - 'foo' index described earlier) results in a complete identifier for making - API requests to these services. It is required by our HTTP module. + NewForgePage = List[Dict[str, Any]] -* Both services respond using JSON, so both implementations of - ``transport_response_simplified`` are similar and quite short. + @dataclass + class NewForgeListerState: + ... -We can also see that there are a few differences: + class NewForgeLister(Lister[NewForgeListerState, NewForgePage]): + LISTER_NAME = "My" + ... -* GitHub sends the next URL as part of the response header, while BitBucket - sends it in the response body. +The new lister must declare a name through the :py:attr:`LISTER_NAME` class attribute. -* GitHub differentiates API versions with a request header (our HTTP - transport mix-in will automatically use any headers provided by an - optional request_headers method that we implement here), while - BitBucket has it as part of their base service URL. BitBucket uses - the IETF standard HTTP 429 response code for their rate limit - notifications (the HTTP transport mix-in automatically handles - that), while GitHub uses their own custom response headers that need - special treatment. +.. _lister-construction: -* But look at them! 58 lines of Python code, combined, to absorb all - repositories from two of the largest and most influential source code hosting - services. +Lister construction +------------------- -Ok, so what is going on behind the scenes? +The lister constructor is only required to ask for a :py:class:`SchedulerInterface` +object to pass to the base class. But it does not mean that it is all that's needed for +it to useful. A lister need information on which remote service to talk to. It needs an +URL. 
-To trace the operation of the code, let's start with a sample instantiation and -progress from there to see which methods get called when. What follows will be -a series of extremely reductionist pseudocode methods. This is not what the -code actually looks like (it's not even real code), but it does have the same -basic flow. Bear with me while I try to lay out lister operation in a -quasi-linear way…:: +Some services are centralized and offered by a single organization. Think of Github. +Others are offered by many people across the Internet, each using a different hosting, +each providing specific data. Think of the many Gitlab instances. We need a name to +identify each instance, and even if there is only one, we need its URL to access it +concretely. - # main task +Now, you may think of any strategy to infer the information or hardcode it, but the base +class needs an URL and an instance name. In any case, for a multi-instance service, you +better be explicit and require the URL as constructor argument. We recommend the URL to +be some form of a base URL, to be concatenated with any variable part appearing either +because there exist multiple instances or the URL need recomputation in the listing +process. - ghl = GitHubLister(lister_name='github.com', - api_baseurl='https://github.com') - ghl.run() +If we need any credentials to access a remote service, and do so in our polite but +persistent fashion (remember that we want fresh information), you are encouraged to +provide support for authenticated access. The base class support handling credentials as +a set of identifier/secret pair. It knows how to load from a secrets store the right +ones for the current ("lister name", "instance name") setting, if none were originally +provided through the task parameters. You can ask for other types of access tokens in a +separate parameter, but then you lose this advantage. 
-⇓ (IndexingLister.run):: +Example of a typical lister constructor:: - # IndexingLister.run + def __init__( + self, + scheduler: SchedulerInterface, + url: str, + instance: str, + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, url=url, instance=instance, credentials=credentials, + ) + ... - identifier = None - do - response, repos = ListerBase.ingest_data(identifier) - identifier = GitHubLister.get_next_target_from_response(response) - while(identifier) +.. _core-lister-functionality: -⇓ (ListerBase.ingest_data):: +Core lister functionality +------------------------- - # ListerBase.ingest_data +For the lister to contribute data to the archive, you now have to write the logic to +fetch data from the remote service, and format it in the canonical form the scheduler +expects, as outined in :ref:`fundamentals`. To this purpose, the two methods to +implement are:: - response = ListerBase.safely_issue_request(identifier) - repos = GitHubLister.transport_response_simplified(response) - injected = ListerBase.inject_repo_data_into_db(repos) - return response, injected + def get_pages(self) -> Iterator[NewForgePage]: + ... -⇓ (ListerBase.safely_issue_request):: + def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]: + ... - # ListerBase.safely_issue_request +Those two core functions are called by the principal lister method, +:py:meth:`Lister.run`, found in the base class. - repeat: - resp = ListerHttpTransport.transport_request(identifier) - retry, delay = ListerHttpTransport.transport_quota_check(resp) - if retry: - sleep(delay) - until((not retry) or too_many_retries) - return resp +:py:meth:`get_pages` is the guts of the lister. It takes no arguments and must produce +data pages. An iterator is fine here, as the :py:meth:`Lister.run` method only mean to +iterate in a single pass on it. This method gets its input from a network request to a +remote service's endpoint to retrieve the data we long for. 
-⇓ (ListerHttpTransport.transport_request):: +Depending on whether the data is adequately structured for our purpose can be tricky. +Here you may have to show off your data scraping skills, or just consume a well-designed +API. Those aspects are discussed more specifically in the section +:ref:`handling-specific-topics`. - # ListerHttpTransport.transport_request +In any case, we want the data we return to be usefully filtered and structured. The +easiest way to create an iterator is to use the `yield` keyword. Yield each data page +you have structured in accordance with the page type you have declared. The page type +exists only for static type checking of data passed from :py:meth:`get_pages` to +:py:meth:`get_origins_from_page`; you can choose whatever fits the bill. - path = ListerBase.api_baseurl - + ListerHttpTransport.PATH_TEMPLATE % identifier - headers = ListerHttpTransport.request_headers() - return http.get(path, headers) +:py:meth:`get_origins_from_page` is simpler. For each individual software origin you +have received in the page, you convert and yield a :py:class:`ListedOrigin` model +object. This datatype has the following mandatory fields: -(Oh look, there's our ``PATH_TEMPLATE``) +* lister id: you generally fill this with the value of :py:attr:`self.lister_obj.id` -⇓ (ListerHttpTransport.request_headers):: +* visit type: the type of software distribution format the service provides. For use by + a corresponding loader. It is an identifier, so you have to either use an existing + value or craft a new one if you get off the beaten track and tackle a new software + source. But then you will have to discuss the name with the core developers. + + Example: Phabricator is a forge that can handle Git or SVN repositories. The visit + type would be "git" when listing such a repo that provides a Git URL that we can load. + +* origin URL: an URL that, combined with the visit type, will serve as the input of + loader. 
+ +This datatype can also further be detailed with the optional fields: + +* last update date: freshness information on this origin, which is useful to the + scheduler for optimizing its scheduling decisions. Fill it if provided by the service, + at no substantial additional runtime cost, e.g. in the same request. + +* extra loader arguments: extra parameters to be passed to the loader for it to be + able to load the origin. It is needed for example when additional context is needed + along with the URL to effectively load from the origin. + +See the definition of ListedOrigin_. + +Now that we have shown how those two methods operate, let's put it together by showing +how they fit in the principal :py:meth:`Lister.run` method:: + + def run(self) -> ListerStats: + + full_stats = ListerStats() + + try: + for page in self.get_pages(): + full_stats.pages += 1 + origins = self.get_origins_from_page(page) + full_stats.origins += self.send_origins(origins) + self.commit_page(page) + finally: + self.finalize() + if self.updated: + self.set_state_in_scheduler() + + return full_stats - # ListerHttpTransport.request_headers +:py:meth:`Lister.send_origins` is the method that sends listed origins to the scheduler. - override → GitHubLister.request_headers +The :py:class:`ListerState` datastructure, defined along the base lister class, is used +to compute the number of listed pages and origins in a single lister run. It is useful +both for the scheduler that automatically collects this information and to test the +lister. + +You see that the bulk of a lister run consists in streaming data gathered from the +remote service to the scheduler. And this is done under a ``try...finally`` construct to +have the lister state reliably recorded in case of unhandled error. We will explain the +role of the remaining methods and attributes appearing here in the next section as it is +related to the lister state. -↑↑ (ListerBase.safely_issue_request) +.. 
_ListedOrigin: https://archive.softwareheritage.org/browse/swh:1:rev:03460207a17d82635ef5a6f12358392143eb9eef/?origin_url=https://forge.softwareheritage.org/source/swh-scheduler.git&path=swh/scheduler/model.py&revision=03460207a17d82635ef5a6f12358392143eb9eef#L134-L177 + +.. _handling-lister-state: + +Handling lister state +--------------------- + +With what we have covered until now you can write a stateless lister. Unfortunately, +some services provide too much data to efficiently deal with it in a one-shot fashion. +Listing a given software source can take several hours or days to process. Our listers +can also give valid output, but fail on an unexpected condition and would have to start +over. As we want to be able to resume the listing process from a given element, provided +by the remote service and guaranteed to be ordered, such as a date or a numeric +identifier, we need to deal with state. + +The remaining part of the lister API is reserved for dealing with lister state. + +If the service to list has no pagination, then the data set to handle is small enough to +not require keeping lister state. In the opposite case, you will have to determine which +piece of information should be recorded in the lister state. As said earlier, we +recommend declaring a dataclass for the lister state:: + + @dataclass + class NewForgeListerState: + current: str = "" + + class NewForgeLister(Lister[NewForgeListerState, NewForgePage]): + ... + +A pair of methods, :py:meth:`state_from_dict` and :py:meth:`state_to_dict` are used to +respectively import lister state from the scheduler and export lister state to the +scheduler. Some fields may need help to be serialized to the scheduler, such as dates, +so this needs to be handled there. + +Where is the state used? Taking the general case of a paginating service, the lister +state is used at the beginning of the :py:meth:`get_pages` method to initialize the +variables associated with the last listing progress. 
That way we can start from an +arbitrary element, or just the first one if there is no last lister state. + +The :py:meth:`commit_page` is called on successful page processing, after the new +origins are sent to the scheduler. Here you should mainly update the lister state by +taking into account the new page processed, e.g. advance a date or serial field. + +Finally, upon either completion or error, the :py:meth:`finalize` is called. There you +must set attribute :py:attr:`updated` to True if you were successful in advancing in the +listing process. To do this you will commonly retrieve the latest saved lister state +from the scheduler and compare with your current lister state. If lister state was +updated, ultimately the current lister state will be recorded in the scheduler. + +We have now seen the stateful lister API. Note that some listers may implement more +flexibility in the use of lister state. Some allow an `incremental` parameter that +governs whether we will do a stateful listing or not. It is up to you to support +additional functionality if it seems relevant. + +.. _handling-specific-topics: + +Handling specific topics +------------------------ + +Here is a quick coverage of common topics left out from lister construction and +:py:meth:`get_pages` descriptions. + +Sessions +^^^^^^^^ + +When requesting a web service repeatedly, most parameters including headers do not +change and could be set up once initially. We recommend setting up, e.g., an HTTP +session as an instance attribute so that further requesting code can focus on what +really changes. +Some ubiquitous HTTP headers include "Accept" to set to the service response format and +"User-Agent" for which we provide a recommended value :py:const:`USER_AGENT` to be +imported from :py:mod:`swh.lister`. Authentication is also commonly provided through +headers, so you can also set it up in the session. 
+ +Transport error handling +^^^^^^^^^^^^^^^^^^^^^^^^ + +We generally recommend logging every unhandleable error with the response content and +then immediately stop the listing by doing an equivalent of +:py:meth:`Response.raise_for_status` from the `requests` library. As for rate-limiting +errors, we have a strategy of using a flexible decorator to handle the retrying for us. +It is based on the `tenacity` library and accessible as :py:func:`throttling_retry` from +:py:mod:`swh.lister.utils`. + +Pagination +^^^^^^^^^^ + +This one is a moving target. You have to understand how the pagination mechanics of the +particular service works. Some guidelines though. The identifier may be minimal (an id +to pass as query parameter), compound (a set of such parameters) or complete (a whole +URL). If the service provides the next URL, use it. The piece of information may be +found either in the response body, or in a header. Once identified, you still have to +implement the logic of requesting and extracting it in a loop and quitting the loop when +there is no more data to fetch. + +Page results +^^^^^^^^^^^^ + +First, when retrieving page results, which involves some protocols and parsing logic, +please make sure that any deviance from what was expected will result in an +informational error. You also have to simplify the results, both with filtering request +parameters if the service supports it, and by extracting from the response only the +information needed into a structured page. This all makes for easier debugging. + +Testing your lister +------------------- + +When developing a new lister, it's important to test. For this, add the tests +(check `swh/lister/*/tests/`) and register the celery tasks in the main +conftest.py (`swh/lister/core/tests/conftest.py`). -⇓ (ListerHttpTransport.transport_quota_check):: +Another important step is to actually run it within the docker-dev +(:ref:`run-lister-tutorial`). 
- # ListerHttpTransport.transport_quota_check +More about listers +------------------ - override → GitHubLister.transport_quota_check +See current implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_ ). -And then we're done. From start to finish, I hope this helps you understand how -the few customized pieces fit into the new shared plumbing. +Old (2017) lister tutorial :ref:`lister-tutorial-2017` -Now you can go and write up a lister for a code hosting site we don't have yet! +.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py +.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py +.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py +.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py