diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py
new file mode 100644
--- /dev/null
+++ b/docs/new_lister_template.py
@@ -0,0 +1,166 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import asdict, dataclass
+import logging
+from typing import Any, Dict, Iterator, List
+from urllib.parse import urljoin
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, Lister
+
+logger = logging.getLogger(__name__)
+
+# Alias for the type of the pages returned by the lister's `get_pages` method.
+NewForgeListerPage = List[Dict[str, Any]]
+
+
+@dataclass
+class NewForgeListerState:
+ """State kept by a NewForgeLister instance between runs, for incremental listing.
+
+ """
+
+ current: str = ""
+ """Id of the last origin listed on an incremental pass"""
+
+
+# If there is no need to keep state, subclass StatelessLister[NewForgeListerPage]
+class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
+ """List origins from the "NewForge" forge.
+
+ """
+
+ # Part of the lister API, that identifies this lister
+ LISTER_NAME = ""
+ # (Optional) VCS type of the origins listed by this lister, if constant
+ VISIT_TYPE = ""
+
+ # Instance URLs include the hostname and the common path prefix of processed URLs
+ EXAMPLE_BASE_URL = "https://netloc/api/v1/"
+ # Path of a specific resource to process, to join the base URL with
+ EXAMPLE_PATH = "origins/list"
+
+ def __init__(
+ self,
+ # Required
+ scheduler: SchedulerInterface,
+ # Instance URL, required for multi-instance listers (e.g. gitlab, ...)
+ url: str,
+ # Instance name (free form) required for multi-instance listers,
+ # or computed from `url`
+ instance: str,
+ # Required whether lister supports authentication or not
+ credentials: CredentialsType = None,
+ ):
+ super().__init__(
+ scheduler=scheduler, credentials=credentials, url=url, instance=instance,
+ )
+
+ self.session = requests.Session()
+ # Declaring the USER_AGENT is more sysadmin-friendly for the forge we list
+ self.session.headers.update(
+ {"Accept": "application/json", "User-Agent": USER_AGENT}
+ )
+
+ def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
+ return NewForgeListerState(**d)
+
+ def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
+ return asdict(state)
+
+ @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ def page_request(self, url, params) -> requests.Response:
+ # Do the network resource request under a retrying decorator
+ # to handle rate limiting and transient errors up to a limit.
+ # `throttling_retry` by default uses the `requests` library to check
+ # only for rate-limit and a base-10 exponential waiting strategy.
+ # This can be customized by passing waiting, retrying and logging strategies
+ # as functions. See the `tenacity` library documentation.
+
+ # Log listed URL to ease debugging
+ logger.debug("Fetching URL %s with params %s", url, params)
+ response = self.session.get(url, params=params)
+
+ if response.status_code != 200:
+ # Log response content to ease debugging
+ logger.warning(
+ "Unexpected HTTP status code %s on %s: %s",
+ response.status_code,
+ response.url,
+ response.content,
+ )
+ # The lister must fail on blocking errors
+ response.raise_for_status()
+
+ return response
+
+ def get_pages(self) -> Iterator[NewForgeListerPage]:
+ # The algorithm depends on the service, but should request data reliably,
+ # following pagination if relevant and yielding pages in a streaming fashion.
+ # If incremental listing is supported, initialize from saved lister state.
+ # Make use of any next page URL provided.
+ # Simplify the results early to ease testing and debugging.
+
+ # Initialize from the lister saved state
+ current = ""
+ if self.state.current is not None:
+ current = self.state.current
+
+ # Construct the URL of a service endpoint, the lister can have others to fetch
+ url = urljoin(self.url, self.EXAMPLE_PATH)
+
+ while current is not None:
+ # Parametrize the request for incremental listing
+ body = self.page_request(url, {"current": current}).json()
+
+ # Simplify the page if possible to only the necessary elements
+ # and yield it
+ yield body
+
+ # Get the next page parameter or end the loop when there is none
+ current = body.get("next")
+
+ def get_origins_from_page(self, page: NewForgeListerPage) -> Iterator[ListedOrigin]:
+ """Convert a page of NewForgeLister repositories into a list of ListedOrigins"""
+ assert self.lister_obj.id is not None
+
+ for element in page:
+
+ yield ListedOrigin(
+ # Required. Should use this value.
+ lister_id=self.lister_obj.id,
+ # Required. Visit type of the currently processed origin
+ visit_type=self.VISIT_TYPE,
+ # Required. URL corresponding to the origin for loaders to ingest
+ url=...,
+ # Should get it if the service provides it and if it induces no
+ # substantial additional processing cost
+ last_update=...,
+ )
+
+ def commit_page(self, page: NewForgeListerPage) -> None:
+ # Update the lister state to the latest `current`; assumes a non-empty page
+ current = page[-1]["current"]
+
+ if current > self.state.current:
+ self.state.current = current
+
+ def finalize(self) -> None:
+ # Pull fresh lister state from the scheduler backend, in case multiple
+ # listers run concurrently
+ scheduler_state = self.get_state_from_scheduler()
+
+ # Update the lister state in the backend only if `current` is fresher than
+ # the one stored in the database.
+ if self.state.current > scheduler_state.current:
+ self.updated = True
diff --git a/docs/tutorial.rst b/docs/tutorial-2017.rst
copy from docs/tutorial.rst
copy to docs/tutorial-2017.rst
--- a/docs/tutorial.rst
+++ b/docs/tutorial-2017.rst
@@ -1,4 +1,4 @@
-.. _lister-tutorial:
+.. _lister-tutorial-2017:
Tutorial: list the content of your favorite forge in just a few steps
=====================================================================
@@ -79,7 +79,8 @@
.. figure:: images/new_bitbucket_lister.png
-And now this is common shared code in a few abstract base classes, with some new features and loads of docstring comments (in red):
+And now this is common shared code in a few abstract base classes, with some new
+features and loads of docstring comments (in red):
.. figure:: images/new_base.png
@@ -215,41 +216,41 @@
from swh.lister.github.models import GitHubModel
class GitHubLister(IndexingHttpLister):
- PATH_TEMPLATE = '/repositories?since=%d'
- MODEL = GitHubModel
-
- def get_model_from_repo(self, repo):
- return {'uid': repo['id'],
- 'indexable': repo['id'],
- 'name': repo['name'],
- 'full_name': repo['full_name'],
- 'html_url': repo['html_url'],
- 'origin_url': repo['html_url'],
- 'origin_type': 'git',
- 'description': repo['description']}
-
- def get_next_target_from_response(self, response):
- if 'next' in response.links:
- next_url = response.links['next']['url']
- return int(next_url.split('since=')[1])
- else:
- return None
-
- def transport_response_simplified(self, response):
- repos = response.json()
- return [self.get_model_from_repo(repo) for repo in repos]
-
- def request_headers(self):
- return {'Accept': 'application/vnd.github.v3+json'}
-
- def transport_quota_check(self, response):
- remain = int(response.headers['X-RateLimit-Remaining'])
- if response.status_code == 403 and remain == 0:
- reset_at = int(response.headers['X-RateLimit-Reset'])
- delay = min(reset_at - time.time(), 3600)
- return True, delay
- else:
- return False, 0
+ PATH_TEMPLATE = '/repositories?since=%d'
+ MODEL = GitHubModel
+
+ def get_model_from_repo(self, repo):
+ return {'uid': repo['id'],
+ 'indexable': repo['id'],
+ 'name': repo['name'],
+ 'full_name': repo['full_name'],
+ 'html_url': repo['html_url'],
+ 'origin_url': repo['html_url'],
+ 'origin_type': 'git',
+ 'description': repo['description']}
+
+ def get_next_target_from_response(self, response):
+ if 'next' in response.links:
+ next_url = response.links['next']['url']
+ return int(next_url.split('since=')[1])
+ else:
+ return None
+
+ def transport_response_simplified(self, response):
+ repos = response.json()
+ return [self.get_model_from_repo(repo) for repo in repos]
+
+ def request_headers(self):
+ return {'Accept': 'application/vnd.github.v3+json'}
+
+ def transport_quota_check(self, response):
+ remain = int(response.headers['X-RateLimit-Remaining'])
+ if response.status_code == 403 and remain == 0:
+ reset_at = int(response.headers['X-RateLimit-Reset'])
+ delay = min(reset_at - time.time(), 3600)
+ return True, delay
+ else:
+ return False, 0
We can see that there are some common elements:
@@ -300,7 +301,7 @@
# main task
ghl = GitHubLister(lister_name='github.com',
- api_baseurl='https://github.com')
+ api_baseurl='https://github.com')
ghl.run()
⇓ (IndexingLister.run)::
@@ -309,8 +310,8 @@
identifier = None
do
- response, repos = ListerBase.ingest_data(identifier)
- identifier = GitHubLister.get_next_target_from_response(response)
+ response, repos = ListerBase.ingest_data(identifier)
+ identifier = GitHubLister.get_next_target_from_response(response)
while(identifier)
⇓ (ListerBase.ingest_data)::
@@ -327,10 +328,10 @@
# ListerBase.safely_issue_request
repeat:
- resp = ListerHttpTransport.transport_request(identifier)
- retry, delay = ListerHttpTransport.transport_quota_check(resp)
- if retry:
- sleep(delay)
+ resp = ListerHttpTransport.transport_request(identifier)
+ retry, delay = ListerHttpTransport.transport_quota_check(resp)
+ if retry:
+ sleep(delay)
until((not retry) or too_many_retries)
return resp
@@ -339,7 +340,7 @@
# ListerHttpTransport.transport_request
path = ListerBase.api_baseurl
- + ListerHttpTransport.PATH_TEMPLATE % identifier
+ + ListerHttpTransport.PATH_TEMPLATE % identifier
headers = ListerHttpTransport.request_headers()
return http.get(path, headers)
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -3,363 +3,363 @@
Tutorial: list the content of your favorite forge in just a few steps
=====================================================================
-(the `original version
-`_
-of this article appeared on the Software Heritage blog)
-
-Back in November 2016, Nicolas Dandrimont wrote about structural code changes
-`leading to a massive (+15 million!) upswing in the number of repositories
-archived by Software Heritage
-`_
-through a combination of automatic linkage between the listing and loading
-scheduler, new understanding of how to deal with extremely large repository
-hosts like `GitHub `_, and activating a new set of
-repositories that had previously been skipped over.
-
-In the post, Nicolas outlined the three major phases of work in Software
-Heritage's preservation process (listing, scheduling updates, loading) and
-highlighted that the ability to preserve the world's free software heritage
-depends on our ability to find and list the repositories.
-
-At the time, Software Heritage was only able to list projects on
-GitHub. Focusing early on GitHub, one of the largest and most active forge in
-the world, allowed for a big value-to-effort ratio and a rapid launch for the
-archive. As the old Italian proverb goes, "Il meglio è nemico del bene," or in
-modern English parlance, "Perfect is the enemy of good," right? Right. So the
-plan from the beginning was to implement a lister for GitHub, then maybe
-implement another one, and then take a few giant steps backward and squint our
-eyes.
-
-Why? Because source code hosting services don't behave according to a unified
-standard. Each new service requires dedicated development time to implement a
-new scraping client for the non-transferable requirements and intricacies of
-that service's API. At the time, doing it in an extensible and adaptable way
-required a level of exposure to the myriad differences between these services
-that we just didn't think we had yet.
-
-Nicolas' post closed by saying "We haven't carved out a stable API yet that
-allows you to just fill in the blanks, as we only have the GitHub lister
-currently, and a proven API will emerge organically only once we have some
-diversity."
-
-That has since changed. As of March 6, 2017, the Software Heritage **lister
-code has been aggressively restructured, abstracted, and commented** to make
-creating new listers significantly easier. There may yet be a few kinks to iron
-out, but **now making a new lister is practically like filling in the blanks**.
+Overview
+--------
+
+The three major phases of work in Software Heritage's preservation process, on the
+technical side, are *listing software sources*, *scheduling updates* and *loading the
+software artifacts into the archive*.
+
+A previous effort in 2017 consisted in designing the framework to make writing a lister
+a straightforward "fill in the blanks" process, based on experience gained with the
+diversity found in the listed services. This is the second iteration on the lister
+framework design, comprising a library and an API which is easier to work with and less
+"magic" (read implicit). This new design is part of a larger effort in redesigning the
+scheduling system for the recurring tasks updating the content of the archive.
+
+.. _fundamentals:
+
+Fundamentals
+------------
Fundamentally, a basic lister must follow these steps:
1. Issue a network request for a service endpoint.
-2. Convert the response into a canonical format.
-3. Populate a work queue for fetching and ingesting source repositories.
-
-Steps 1 and 3 are generic problems, so they can get generic solutions hidden
-away in the base code, most of which never needs to change. That leaves us to
-implement step 2, which can be trivially done now for services with a clean web
-APIs.
-
-In the new code, we've tried to hide away as much generic functionality as
-possible, turning it into set-and-forget plumbing between a few simple
-customized elements. Different hosting services might use different network
-protocols, rate-limit messages, or pagination schemes, but, as long as there is
-some way to get a list of the hosted repositories, we think that the new base
-code will make getting those repositories much easier.
-
-First, let me give you the 30,000 foot view…
-
-The old GitHub-specific lister code looked like this (265 lines of Python):
-
-.. figure:: images/old_github_lister.png
-
-By contrast, the new GitHub-specific code looks like this (34 lines of Python):
-
-.. figure:: images/new_github_lister.png
-
-And the new BitBucket-specific code is even shorter and looks like this (24 lines of Python):
-
-.. figure:: images/new_bitbucket_lister.png
-
-And now this is common shared code in a few abstract base classes, with some new features and loads of docstring comments (in red):
-
-.. figure:: images/new_base.png
-
-So how does the lister code work now, and **how might a contributing developer
-go about making a new one**
-
-The first thing to know is that we now have a generic lister base class and ORM
-model. A subclass of the lister base should already be able to do almost
-everything needed to complete a listing task for a single service
-request/response cycle with the following implementation requirements:
-
-1. A member variable must be declared called ``MODEL``, which is equal to a
- subclass (Note: type, not instance) of the base ORM model. The reasons for
- using a subclass is mostly just because different services use different
- incompatible primary identifiers for their repositories. The model
- subclasses are typically only one or two additional variable declarations.
-
-2. A method called ``transport_request`` must be implemented, which takes the
- complete target identifier (e.g., a URL) and tries to request it one time
- using whatever transport protocol is required for interacting with the
- service. It should not attempt to retry on timeouts or do anything else with
- the response (that is already done for you). It should just either return
- the response or raise a ``FetchError`` exception.
-
-3. A method called ``transport_response_to_string`` must be implemented, which
- takes the entire response of the request in (1) and converts it to a string
- for logging purposes.
-
-4. A method called ``transport_quota_check`` must be implemented, which takes
- the entire response of the request in (1) and checks to see if the process
- has run afoul of any query quotas or rate limits. If the service says to
- wait before making more requests, the method should return ``True`` and also
- the number of seconds to wait, otherwise it returns ``False``.
-
-5. A method called ``transport_response_simplified`` must be implemented, which
- also takes the entire response of the request in (1) and converts it to a
- Python list of dicts (one dict for each repository) with keys given
- according to the aforementioned ``MODEL`` class members.
-
-Because 1, 2, 3, and 4 are basically dependent only on the chosen network
-protocol, we also have an HTTP mix-in module, which supplements the lister base
-and provides default implementations for those methods along with optional
-request header injection using the Python Requests library. The
-``transport_quota_check`` method as provided follows the IETF standard for
-communicating rate limits with `HTTP code 429
-`_ which some hosting services
-have chosen not to follow, so it's possible that a specific lister will need to
-override it.
-
-On top of all of that, we also provide another layer over the base lister class
-which adds support for sequentially looping over indices. What are indices?
-Well, some services (`BitBucket `_ and GitHub for
-example) don't send you the entire list of all of their repositories at once,
-because that server response would be unwieldy. Instead they paginate their
-results, and they also allow you to query their APIs like this:
-``https://server_address.tld/query_type?start_listing_from_id=foo``. Changing
-the value of 'foo' lets you fetch a set of repositories starting from there. We
-call 'foo' an index, and we call a service that works this way an indexing
-service. GitHub uses the repository unique identifier and BitBucket uses the
-repository creation time, but a service can really use anything as long as the
-values monotonically increase with new repositories. A good indexing service
-also includes the URL of the next page with a later 'foo' in its responses. For
-these indexing services we provide another intermediate lister called the
-indexing lister. Instead of inheriting from :class:`ListerBase
-`, the lister class would inherit
-from :class:`IndexingLister
-`. Along with the
-requirements of the lister base, the indexing lister base adds one extra
-requirement:
-
-1. A method called ``get_next_target_from_response`` must be defined, which
- takes a complete request response and returns the index ('foo' above) of the
- next page.
-
-So those are all the basic requirements. There are, of course, a few other
-little bits and pieces (covered for now in the code's docstring comments), but
-for the most part that's it. It sounds like a lot of information to absorb and
-implement, but remember that most of the implementation requirements mentioned
-above are already provided for 99% of services by the HTTP mix-in module. It
-looks much simpler when we look at the actual implementations of the two
-new-style indexing listers we currently have…
+2. Convert the response data into a model object.
+3. Send the model object to the scheduler.
-When developing a new lister, it's important to test. For this, add the tests
-(check `swh/lister/*/tests/`) and register the celery tasks in the main
-conftest.py (`swh/lister/core/tests/conftest.py`).
+Steps 1 and 3 are generic problems, that are often already solved by helpers or in other
+listers. That leaves us mainly to implement step 2, which is simple when the remote
+service provides an API.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+
+Skills:
+
+* object-oriented Python
+* requesting remote services through HTTP
+* scraping if no API is offered
+
+Analysis of the target service. Prepare the following elements to write the lister:
+
+* instance names and URLs
+* requesting scheme: base URL, path, query_string, POST data, headers
+* authentication types and which one to support, if any
+* rate-limiting: HTTP codes and headers used
+* data format: JSON/XML/HTML/...?
+* mapping between remote data and needed data (ListedOrigin model, internal state)
+
+We will now walk through the steps to build a new lister.
+Please use this template to start with: :download:`new_lister_template.py`
+
+.. _lister-declaration:
+
+Lister declaration
+------------------
+
+In order to write a lister, two basic elements are required. These are the
+:py:class:`Lister` base class and the :py:class:`ListedOrigin` scheduler model class.
+Optionally, for listers that need to keep a state and support incremental listing, an
+additional object :py:class:`ListerState` will come into play.
+
+Each lister must subclass :py:class:`Lister ` either directly
+or through a subclass such as :py:class:`StatelessLister
+` for stateless ones.
+
+We extensively type-annotate our listers, as any new code, which makes it prominent
+that those lister classes are generic, and take the following parameters:
+
+* :py:class:`Lister`: the lister state type, the page type
+* :py:class:`StatelessLister`: only the page type
-Another important step is to actually run it within the
-docker-dev (:ref:`run-lister-tutorial`).
-
-This is the entire source code for the BitBucket repository lister::
-
- # Copyright (C) 2017 the Software Heritage developers
- # License: GNU General Public License version 3 or later
- # See top-level LICENSE file for more information
-
- from urllib import parse
- from swh.lister.bitbucket.models import BitBucketModel
- from swh.lister.core.indexing_lister import IndexingHttpLister
-
- class BitBucketLister(IndexingHttpLister):
- PATH_TEMPLATE = '/repositories?after=%s'
- MODEL = BitBucketModel
-
- def get_model_from_repo(self, repo):
- return {'uid': repo['uuid'],
- 'indexable': repo['created_on'],
- 'name': repo['name'],
- 'full_name': repo['full_name'],
- 'html_url': repo['links']['html']['href'],
- 'origin_url': repo['links']['clone'][0]['href'],
- 'origin_type': repo['scm'],
- 'description': repo['description']}
-
- def get_next_target_from_response(self, response):
- body = response.json()
- if 'next' in body:
- return parse.unquote(body['next'].split('after=')[1])
- else:
- return None
-
- def transport_response_simplified(self, response):
- repos = response.json()['values']
- return [self.get_model_from_repo(repo) for repo in repos]
-
-And this is the entire source code for the GitHub repository lister::
-
- # Copyright (C) 2017 the Software Heritage developers
- # License: GNU General Public License version 3 or later
- # See top-level LICENSE file for more information
-
- import time
- from swh.lister.core.indexing_lister import IndexingHttpLister
- from swh.lister.github.models import GitHubModel
-
- class GitHubLister(IndexingHttpLister):
- PATH_TEMPLATE = '/repositories?since=%d'
- MODEL = GitHubModel
-
- def get_model_from_repo(self, repo):
- return {'uid': repo['id'],
- 'indexable': repo['id'],
- 'name': repo['name'],
- 'full_name': repo['full_name'],
- 'html_url': repo['html_url'],
- 'origin_url': repo['html_url'],
- 'origin_type': 'git',
- 'description': repo['description']}
-
- def get_next_target_from_response(self, response):
- if 'next' in response.links:
- next_url = response.links['next']['url']
- return int(next_url.split('since=')[1])
- else:
- return None
-
- def transport_response_simplified(self, response):
- repos = response.json()
- return [self.get_model_from_repo(repo) for repo in repos]
-
- def request_headers(self):
- return {'Accept': 'application/vnd.github.v3+json'}
-
- def transport_quota_check(self, response):
- remain = int(response.headers['X-RateLimit-Remaining'])
- if response.status_code == 403 and remain == 0:
- reset_at = int(response.headers['X-RateLimit-Reset'])
- delay = min(reset_at - time.time(), 3600)
- return True, delay
- else:
- return False, 0
+You can start by declaring a stateless lister and leave the implementation of state
+for later if the listing needs it. We will see how to in :ref:`handling-lister-state`.
-We can see that there are some common elements:
+Both the lister state type and the page type are user-defined types. However, while the
+page type may only exist as a type annotation, the state type for a stateful lister must
+be associated with a concrete object. The state type is commonly defined as a dataclass
+whereas the page type is often a mere annotation, potentially given a nice alias.
-* Both use the HTTP transport mixin (:class:`IndexingHttpLister
- `) just combines
- :class:`ListerHttpTransport
- ` and
- :class:`IndexingLister
- `) to get most of the
- network request functionality for free.
+Example lister declaration::
-* Both also define ``MODEL`` and ``PATH_TEMPLATE`` variables. It should be
- clear to developers that ``PATH_TEMPLATE``, when combined with the base
- service URL (e.g., ``https://some_service.com``) and passed a value (the
- 'foo' index described earlier) results in a complete identifier for making
- API requests to these services. It is required by our HTTP module.
+ NewForgePage = List[Dict[str, Any]]
-* Both services respond using JSON, so both implementations of
- ``transport_response_simplified`` are similar and quite short.
+ @dataclass
+ class NewForgeListerState:
+ ...
-We can also see that there are a few differences:
+ class NewForgeLister(Lister[NewForgeListerState, NewForgePage]):
+ LISTER_NAME = "My"
+ ...
-* GitHub sends the next URL as part of the response header, while BitBucket
- sends it in the response body.
+The new lister must declare a name through the :py:attr:`LISTER_NAME` class attribute.
-* GitHub differentiates API versions with a request header (our HTTP
- transport mix-in will automatically use any headers provided by an
- optional request_headers method that we implement here), while
- BitBucket has it as part of their base service URL. BitBucket uses
- the IETF standard HTTP 429 response code for their rate limit
- notifications (the HTTP transport mix-in automatically handles
- that), while GitHub uses their own custom response headers that need
- special treatment.
+.. _lister-construction:
-* But look at them! 58 lines of Python code, combined, to absorb all
- repositories from two of the largest and most influential source code hosting
- services.
+Lister construction
+-------------------
-Ok, so what is going on behind the scenes?
+The lister constructor is only required to ask for a :py:class:`SchedulerInterface`
+object to pass to the base class. But it does not mean that it is all that's needed for
+it to be useful. A lister needs information on which remote service to talk to. It needs
+a URL.
-To trace the operation of the code, let's start with a sample instantiation and
-progress from there to see which methods get called when. What follows will be
-a series of extremely reductionist pseudocode methods. This is not what the
-code actually looks like (it's not even real code), but it does have the same
-basic flow. Bear with me while I try to lay out lister operation in a
-quasi-linear way…::
+Some services are centralized and offered by a single organization. Think of GitHub.
+Others are offered by many people across the Internet, each using a different hosting,
+each providing specific data. Think of the many GitLab instances. We need a name to
+identify each instance, and even if there is only one, we need its URL to access it
+concretely.
- # main task
+Now, you may think of any strategy to infer the information or hardcode it, but the base
+class needs a URL and an instance name. In any case, for a multi-instance service, you
+had better be explicit and require the URL as a constructor argument. We recommend the
+URL to be some form of a base URL, to be concatenated with any variable part appearing
+either because there exist multiple instances or the URL needs recomputation in the
+listing process.
- ghl = GitHubLister(lister_name='github.com',
- api_baseurl='https://github.com')
- ghl.run()
+If we need any credentials to access a remote service, and do so in our polite but
+persistent fashion (remember that we want fresh information), you are encouraged to
+provide support for authenticated access. The base class supports handling credentials
+as a set of identifier/secret pairs. It knows how to load from a secrets store the right
+ones for the current ("lister name", "instance name") setting, if none were originally
+provided through the task parameters. You can ask for other types of access tokens in a
+separate parameter, but then you lose this advantage.
-⇓ (IndexingLister.run)::
+Example of a typical lister constructor::
- # IndexingLister.run
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ url: str,
+ instance: str,
+ credentials: CredentialsType = None,
+ ):
+ super().__init__(
+ scheduler=scheduler, url=url, instance=instance, credentials=credentials,
+ )
+ ...
- identifier = None
- do
- response, repos = ListerBase.ingest_data(identifier)
- identifier = GitHubLister.get_next_target_from_response(response)
- while(identifier)
+.. _core-lister-functionality:
-⇓ (ListerBase.ingest_data)::
+Core lister functionality
+-------------------------
- # ListerBase.ingest_data
+For the lister to contribute data to the archive, you now have to write the logic to
+fetch data from the remote service, and format it in the canonical form the scheduler
+expects, as outlined in :ref:`fundamentals`. To this purpose, the two methods to
+implement are::
- response = ListerBase.safely_issue_request(identifier)
- repos = GitHubLister.transport_response_simplified(response)
- injected = ListerBase.inject_repo_data_into_db(repos)
- return response, injected
+ def get_pages(self) -> Iterator[NewForgePage]:
+ ...
-⇓ (ListerBase.safely_issue_request)::
+ def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]:
+ ...
- # ListerBase.safely_issue_request
+Those two core functions are called by the principal lister method,
+:py:meth:`Lister.run`, found in the base class.
- repeat:
- resp = ListerHttpTransport.transport_request(identifier)
- retry, delay = ListerHttpTransport.transport_quota_check(resp)
- if retry:
- sleep(delay)
- until((not retry) or too_many_retries)
- return resp
+:py:meth:`get_pages` is the guts of the lister. It takes no arguments and must produce
+data pages. An iterator is fine here, as the :py:meth:`Lister.run` method only means to
+iterate in a single pass on it. This method gets its input from a network request to a
+remote service's endpoint to retrieve the data we long for.
-⇓ (ListerHttpTransport.transport_request)::
+Depending on how adequately the data is structured for our purpose, this can be tricky.
+Here you may have to show off your data scraping skills, or just consume a well-designed
+API. Those aspects are discussed more specifically in the section
+:ref:`handling-specific-topics`.
- # ListerHttpTransport.transport_request
+In any case, we want the data we return to be usefully filtered and structured. The
+easiest way to create an iterator is to use the `yield` keyword. Yield each data page
+you have structured in accordance with the page type you have declared. The page type
+exists only for static type checking of data passed from :py:meth:`get_pages` to
+:py:meth:`get_origins_from_page`; you can choose whatever fits the bill.
- path = ListerBase.api_baseurl
- + ListerHttpTransport.PATH_TEMPLATE % identifier
- headers = ListerHttpTransport.request_headers()
- return http.get(path, headers)
+:py:meth:`get_origins_from_page` is simpler. For each individual software origin you
+have received in the page, you convert and yield a :py:class:`ListedOrigin` model
+object. This datatype has the following mandatory fields:
-(Oh look, there's our ``PATH_TEMPLATE``)
+* lister id: you generally fill this with the value of :py:attr:`self.lister_obj.id`
-⇓ (ListerHttpTransport.request_headers)::
+* visit type: the type of software distribution format the service provides. For use by
+ a corresponding loader. It is an identifier, so you have to either use an existing
+ value or craft a new one if you get off the beaten track and tackle a new software
+ source. But then you will have to discuss the name with the core developers.
+
+ Example: Phabricator is a forge that can handle Git or SVN repositories. The visit
+ type would be "git" when listing such a repo that provides a Git URL that we can load.
+
+* origin URL: a URL that, combined with the visit type, will serve as the input of
+ the loader.
+
+This datatype can also further be detailed with the optional fields:
+
+* last update date: freshness information on this origin, which is useful to the
+ scheduler for optimizing its scheduling decisions. Fill it if provided by the service,
+ at no substantial additional runtime cost, e.g. in the same request.
+
+* extra loader arguments: extra parameters to be passed to the loader for it to be
+ able to load the origin. It is needed for example when additional context is needed
+ along with the URL to effectively load from the origin.
+
+See the definition of ListedOrigin_.
+
+Now that we showed how those two methods operate, let's put it together by showing
+how they fit in the principal :py:meth:`Lister.run` method::
+
+ def run(self) -> ListerStats:
+
+ full_stats = ListerStats()
+
+ try:
+ for page in self.get_pages():
+ full_stats.pages += 1
+ origins = self.get_origins_from_page(page)
+ full_stats.origins += self.send_origins(origins)
+ self.commit_page(page)
+ finally:
+ self.finalize()
+ if self.updated:
+ self.set_state_in_scheduler()
+
+ return full_stats
- # ListerHttpTransport.request_headers
+:py:meth:`Lister.send_origins` is the method that sends listed origins to the scheduler.
- override → GitHubLister.request_headers
+The :py:class:`ListerStats` datastructure, defined along the base lister class, is used
+to compute the number of listed pages and origins in a single lister run. It is useful
+both for the scheduler that automatically collects this information and to test the
+lister.
+
+You see that the bulk of a lister run consists in streaming data gathered from the
+remote service to the scheduler. And this is done under a ``try...finally`` construct to
+have the lister state reliably recorded in case of unhandled error. We will explain the
+role of the remaining methods and attributes appearing here in the next section as it is
+related to the lister state.
-↑↑ (ListerBase.safely_issue_request)
+.. _ListedOrigin: https://archive.softwareheritage.org/browse/swh:1:rev:03460207a17d82635ef5a6f12358392143eb9eef/?origin_url=https://forge.softwareheritage.org/source/swh-scheduler.git&path=swh/scheduler/model.py&revision=03460207a17d82635ef5a6f12358392143eb9eef#L134-L177
+
+.. _handling-lister-state:
+
+Handling lister state
+---------------------
+
+With what we have covered until now you can write a stateless lister. Unfortunately,
+some services provide too much data to efficiently deal with it in a one-shot fashion.
+Listing a given software source can take several hours or days to process. Our listers
+can also give valid output, but fail on an unexpected condition and would have to start
+over. As we want to be able to resume the listing process from a given element, provided
+by the remote service and guaranteed to be ordered, such as a date or a numeric
+identifier, we need to deal with state.
+
+The remaining part of the lister API is reserved for dealing with lister state.
+
+If the service to list has no pagination, then the data set to handle is small enough to
+not require keeping lister state. In the opposite case, you will have to determine which
+piece of information should be recorded in the lister state. As said earlier, we
+recommend declaring a dataclass for the lister state::
+
+ @dataclass
+ class NewForgeListerState:
+ current: str = ""
+
+ class NewForgeLister(Lister[NewForgeListerState, NewForgePage]):
+ ...
+
+A pair of methods, :py:meth:`state_from_dict` and :py:meth:`state_to_dict` are used to
+respectively import lister state from the scheduler and export lister state to the
+scheduler. Some fields may need help to be serialized to the scheduler, such as dates,
+so this needs to be handled there.
+
+Where is the state used? Taking the general case of a paginating service, the lister
+state is used at the beginning of the :py:meth:`get_pages` method to initialize the
+variables associated with the last listing progress. That way we can start from an
+arbitrary element, or just the first one if there is no last lister state.
+
+The :py:meth:`commit_page` is called on successful page processing, after the new
+origins are sent to the scheduler. Here you should mainly update the lister state by
+taking into account the new page processed, e.g. advance a date or serial field.
+
+Finally, upon either completion or error, the :py:meth:`finalize` is called. There you
+must set attribute :py:attr:`updated` to True if you were successful in advancing in the
+listing process. To do this you will commonly retrieve the latest saved lister state
+from the scheduler and compare with your current lister state. If lister state was
+updated, ultimately the current lister state will be recorded in the scheduler.
+
+We have now seen the stateful lister API. Note that some listers may implement more
+flexibility in the use of lister state. Some allow an `incremental` parameter that
+governs whether or not we will do a stateful listing. It is up to you to support
+additional functionality if it seems relevant.
+
+.. _handling-specific-topics:
+
+Handling specific topics
+------------------------
+
+Here is a quick coverage of common topics left out from lister construction and
+:py:meth:`get_pages` descriptions.
+
+Sessions
+^^^^^^^^
+
+When requesting a web service repeatedly, most parameters including headers do not
+change and could be set up once initially. We recommend setting up e.g. an HTTP session
+as an instance attribute so that further requesting code can focus on what really changes.
+Some ubiquitous HTTP headers include "Accept" to set to the service response format and
+"User-Agent" for which we provide a recommended value :py:const:`USER_AGENT` to be
+imported from :py:mod:`swh.lister`. Authentication is also commonly provided through
+headers, so you can also set it up in the session.
+
+Transport error handling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+We generally recommend logging every unhandleable error with the response content and
+then immediately stopping the listing by doing an equivalent of
+:py:meth:`Response.raise_for_status` from the `requests` library. As for rate-limiting
+errors, we have a strategy of using a flexible decorator to handle the retrying for us.
+It is based on the `tenacity` library and accessible as :py:func:`throttling_retry` from
+:py:mod:`swh.lister.utils`.
+
+Pagination
+^^^^^^^^^^
+
+This one is a moving target. You have to understand how the pagination mechanics of the
+particular service work. Some guidelines though. The identifier may be minimal (an id
+to pass as query parameter), compound (a set of such parameters) or complete (a whole
+URL). If the service provides the next URL, use it. The piece of information may be
+found either in the response body, or in a header. Once identified, you still have to
+implement the logic of requesting and extracting it in a loop and quitting the loop when
+there is no more data to fetch.
+
+Page results
+^^^^^^^^^^^^
+
+First, when retrieving page results, which involves some protocols and parsing logic,
+please make sure that any deviation from what was expected will result in an
+informative error. You also have to simplify the results, both with filtering request
+parameters if the service supports it, and by extracting from the response only the
+information needed into a structured page. This all makes for easier debugging.
+
+Testing your lister
+-------------------
+
+When developing a new lister, it's important to test. For this, add the tests
+(check `swh/lister/*/tests/`) and register the celery tasks in the main
+conftest.py (`swh/lister/core/tests/conftest.py`).
-⇓ (ListerHttpTransport.transport_quota_check)::
+Another important step is to actually run it within the docker-dev
+(:ref:`run-lister-tutorial`).
- # ListerHttpTransport.transport_quota_check
+More about listers
+------------------
- override → GitHubLister.transport_quota_check
+See currently implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_).
-And then we're done. From start to finish, I hope this helps you understand how
-the few customized pieces fit into the new shared plumbing.
+Old (2017) lister tutorial :ref:`lister-tutorial-2017`
-Now you can go and write up a lister for a code hosting site we don't have yet!
+.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py
+.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py
+.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py
+.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py