diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py index 20e3e90..41e27a7 100644 --- a/docs/new_lister_template.py +++ b/docs/new_lister_template.py @@ -1,165 +1,165 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging from typing import Any, Dict, Iterator, List from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. NewForgeListerPage = List[Dict[str, Any]] @dataclass class NewForgeListerState: """The NewForgeLister instance state. This is used for incremental listing.""" current: str = "" """Id of the last origin listed on an incremental pass""" # If there is no need to keep state, subclass StatelessLister[NewForgeListerPage] class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]): """List origins from the "NewForge" forge.""" # Part of the lister API, that identifies this lister LISTER_NAME = "" # (Optional) CVS type of the origins listed by this lister, if constant VISIT_TYPE = "" # Instance URLs include the hostname and the common path prefix of processed URLs EXAMPLE_BASE_URL = "https://netloc/api/v1/" # Path of a specific resource to process, to join the base URL with EXAMPLE_PATH = "origins/list" def __init__( self, # Required scheduler: SchedulerInterface, # Instance URL, required for multi-instances listers (e.g gitlab, ...) 
url: str, # Instance name (free form) required for multi-instance listers, # or computed from `url` instance: str, # Required whether lister supports authentication or not credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() # Declare the USER_AGENT is more sysadm-friendly for the forge we list self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState: return NewForgeListerState(**d) def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url, params) -> requests.Response: # Do the network resource request under a retrying decorator # to handle rate limiting and transient errors up to a limit. - # `throttling_retry` by default use the `requests` library to check + # `http_retry` by default use the `requests` library to check # only for rate-limit and a base-10 exponential waiting strategy. # This can be customized by passed waiting, retrying and logging strategies # as functions. See the `tenacity` library documentation. # Log listed URL to ease debugging logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: # Log response content to ease debugging logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) # The lister must fail on blocking errors response.raise_for_status() return response def get_pages(self) -> Iterator[NewForgeListerPage]: # The algorithm depends on the service, but should request data reliably, # following pagination if relevant and yielding pages in a streaming fashion. 
# If incremental listing is supported, initialize from saved lister state. # Make use of any next page URL provided. # Simplify the results early to ease testing and debugging. # Initialize from the lister saved state current = "" if self.state.current is not None: current = self.state.current # Construct the URL of a service endpoint, the lister can have others to fetch url = urljoin(self.url, self.EXAMPLE_PATH) while current is not None: # Parametrize the request for incremental listing body = self.page_request(url, {"current": current}).json() # Simplify the page if possible to only the necessary elements # and yield it yield body # Get the next page parameter or end the loop when there is none current = body.get("next") def get_origins_from_page(self, page: NewForgeListerPage) -> Iterator[ListedOrigin]: """Convert a page of NewForgeLister repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None for element in page: yield ListedOrigin( # Required. Should use this value. lister_id=self.lister_obj.id, # Required. Visit type of the currently processed origin visit_type=self.VISIT_TYPE, # Required. URL corresponding to the origin for loaders to ingest url=..., # Should get it if the service provides it and if it induces no # substantial additional processing cost last_update=..., ) def commit_page(self, page: NewForgeListerPage) -> None: # Update the lister state to the latest `current` current = page[-1]["current"] if current > self.state.current: self.state.current = current def finalize(self) -> None: # Pull fresh lister state from the scheduler backend, in case multiple # listers run concurrently scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if `current` is fresher than # the one stored in the database. 
if self.state.current > scheduler_state.current: self.updated = True diff --git a/docs/tutorial.rst b/docs/tutorial.rst index d4ae380..c01195e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1,380 +1,380 @@ .. _lister-tutorial: Tutorial: list the content of your favorite forge in just a few steps ===================================================================== Overview -------- The three major phases of work in Software Heritage's preservation process, on the technical side, are *listing software sources*, *scheduling updates* and *loading the software artifacts into the archive*. A previous effort in 2017 consisted in designing the framework to make lister a straightforward "fill in the blanks" process, based on gained experience on the diversity found in the listed services. This is the second iteration on the lister framework design, comprising a library and an API which is easier to work with and less "magic" (read implicit). This new design is part of a larger effort in redesigning the scheduling system for the recurring tasks updating the content of the archive. .. _fundamentals: Fundamentals ------------ Fundamentally, a basic lister must follow these steps: 1. Issue a network request for a service endpoint. 2. Convert the response data into a model object. 3. Send the model object to the scheduler. Steps 1 and 3 are generic problems, that are often already solved by helpers or in other listers. That leaves us mainly to implement step 2, which is simple when the remote service provides an API. .. _prerequisites: Prerequisites ------------- Skills: * object-oriented Python * requesting remote services through HTTP * scrapping if no API is offered Analysis of the target service. 
Prepare the following elements to write the lister: * instance names and URLs * requesting scheme: base URL, path, query_string, POST data, headers * authentication types and which one to support, if any * rate-limiting: HTTP codes and headers used * data format: JSON/XML/HTML/...? * mapping between remote data and needed data (ListedOrigin model, internal state) We will now walk through the steps to build a new lister. Please use this template to start with: :download:`new_lister_template.py` .. _lister-declaration: Lister declaration ------------------ In order to write a lister, two basic elements are required. These are the :py:class:`Lister` base class and the :py:class:`ListedOrigin` scheduler model class. Optionally, for listers that need to keep a state and support incremental listing, an additional object :py:class:`ListerState` will come into play. Each lister must subclass :py:class:`Lister <swh.lister.pattern.Lister>` either directly or through a subclass such as :py:class:`StatelessLister <swh.lister.pattern.StatelessLister>` for stateless ones. We extensively type-annotate our listers, as any new code, which makes it prominent that those lister classes are generic, and take the following parameters: * :py:class:`Lister`: the lister state type, the page type * :py:class:`StatelessLister`: only the page type You can start by declaring a stateless lister and leave the implementation of state for later if the listing needs it. We will see how in :ref:`handling-lister-state`. Both the lister state type and the page type are user-defined types. However, while the page type may only exist as a type annotation, the state type for a stateful lister must be associated with a concrete object. The state type is commonly defined as a dataclass whereas the page type is often a mere annotation, potentially given a nice alias. Example lister declaration:: NewForgePage = List[Dict[str, Any]] @dataclass class NewForgeListerState: ... class NewForgeLister(Lister[NewForgeListerState, NewForgePage]): LISTER_NAME = "My" ...
The new lister must declare a name through the :py:attr:`LISTER_NAME` class attribute. .. _lister-construction: Lister construction ------------------- The lister constructor is only required to ask for a :py:class:`SchedulerInterface` object to pass to the base class. But it does not mean that it is all that's needed for it to be useful. A lister needs information on which remote service to talk to. It needs an URL. Some services are centralized and offered by a single organization. Think of Github. Others are offered by many people across the Internet, each using a different hosting, each providing specific data. Think of the many Gitlab instances. We need a name to identify each instance, and even if there is only one, we need its URL to access it concretely. Now, you may think of any strategy to infer the information or hardcode it, but the base class needs an URL and an instance name. In any case, for a multi-instance service, you better be explicit and require the URL as constructor argument. We recommend the URL to be some form of a base URL, to be concatenated with any variable part appearing either because there exist multiple instances or the URL needs recomputation in the listing process. If we need any credentials to access a remote service, and do so in our polite but persistent fashion (remember that we want fresh information), you are encouraged to provide support for authenticated access. The base class supports handling credentials as a set of identifier/secret pairs. It knows how to load from a secrets store the right ones for the current ("lister name", "instance name") setting, if none were originally provided through the task parameters. You can ask for other types of access tokens in a separate parameter, but then you lose this advantage.
Example of a typical lister constructor:: def __init__( self, scheduler: SchedulerInterface, url: str, instance: str, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, ) ... .. _core-lister-functionality: Core lister functionality ------------------------- For the lister to contribute data to the archive, you now have to write the logic to fetch data from the remote service, and format it in the canonical form the scheduler expects, as outlined in :ref:`fundamentals`. To this purpose, the two methods to implement are:: def get_pages(self) -> Iterator[NewForgePage]: ... def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]: ... Those two core functions are called by the principal lister method, :py:meth:`Lister.run`, found in the base class. :py:meth:`get_pages` is the guts of the lister. It takes no arguments and must produce data pages. An iterator is fine here, as the :py:meth:`Lister.run` method only means to iterate over it in a single pass. This method gets its input from a network request to a remote service's endpoint to retrieve the data we long for. Depending on whether the data is adequately structured for our purpose, this can be tricky. Here you may have to show off your data scraping skills, or just consume a well-designed API. Those aspects are discussed more specifically in the section :ref:`handling-specific-topics`. In any case, we want the data we return to be usefully filtered and structured. The easiest way to create an iterator is to use the ``yield`` keyword. Yield each data page you have structured in accordance with the page type you have declared. The page type exists only for static type checking of data passed from :py:meth:`get_pages` to :py:meth:`get_origins_from_page`; you can choose whatever fits the bill. :py:meth:`get_origins_from_page` is simpler.
For each individual software origin you have received in the page, you convert and yield a :py:class:`ListedOrigin` model object. This datatype has the following mandatory fields: * lister id: you generally fill this with the value of :py:attr:`self.lister_obj.id` * visit type: the type of software distribution format the service provides. For use by a corresponding loader. It is an identifier, so you have to either use an existing value or craft a new one if you get off the beaten track and tackle a new software source. But then you will have to discuss the name with the core developers. Example: Phabricator is a forge that can handle Git or SVN repositories. The visit type would be "git" when listing such a repo that provides a Git URL that we can load. * origin URL: an URL that, combined with the visit type, will serve as the input of the loader. This datatype can also further be detailed with the optional fields: * last update date: freshness information on this origin, which is useful to the scheduler for optimizing its scheduling decisions. Fill it if provided by the service, at no substantial additional runtime cost, e.g. in the same request. * extra loader arguments: extra parameters to be passed to the loader for it to be able to load the origin. It is needed for example when additional context is needed along with the URL to effectively load from the origin. See the definition of :py:class:`ListedOrigin <swh.scheduler.model.ListedOrigin>`. Now that we have shown how those two methods operate, let's put it together by showing how they fit in the principal :py:meth:`Lister.run` method:: def run(self) -> ListerStats: full_stats = ListerStats() try: for page in self.get_pages(): full_stats.pages += 1 origins = self.get_origins_from_page(page) full_stats.origins += self.send_origins(origins) self.commit_page(page) finally: self.finalize() if self.updated: self.set_state_in_scheduler() return full_stats :py:meth:`Lister.send_origins` is the method that sends listed origins to the scheduler.
The :py:class:`ListerStats` datastructure, defined along the base lister class, is used to compute the number of listed pages and origins in a single lister run. It is useful both for the scheduler that automatically collects this information and to test the lister. You see that the bulk of a lister run consists in streaming data gathered from the remote service to the scheduler. And this is done under a ``try...finally`` construct to have the lister state reliably recorded in case of unhandled error. We will explain the role of the remaining methods and attributes appearing here in the next section as they are related to the lister state. .. _handling-lister-state: Handling lister state --------------------- With what we have covered until now you can write a stateless lister. Unfortunately, some services provide too much data to efficiently deal with it in a one-shot fashion. Listing a given software source can take several hours or days to process. Our listers can also give valid output, but fail on an unexpected condition and would have to start over. As we want to be able to resume the listing process from a given element, provided by the remote service and guaranteed to be ordered, such as a date or a numeric identifier, we need to deal with state. The remaining part of the lister API is reserved for dealing with lister state. If the service to list has no pagination, then the data set to handle is small enough to not require keeping lister state. In the opposite case, you will have to determine which piece of information should be recorded in the lister state. As said earlier, we recommend declaring a dataclass for the lister state:: @dataclass class NewForgeListerState: current: str = "" class NewForgeLister(Lister[NewForgeListerState, NewForgePage]): ... A pair of methods, :py:meth:`state_from_dict` and :py:meth:`state_to_dict` are used to respectively import lister state from the scheduler and export lister state to the scheduler.
Some fields may need help to be serialized to the scheduler, such as dates, so this needs to be handled there. Where is the state used? Taking the general case of a paginating service, the lister state is used at the beginning of the :py:meth:`get_pages` method to initialize the variables associated with the last listing progress. That way we can start from an arbitrary element, or just the first one if there is no last lister state. The :py:meth:`commit_page` is called on successful page processing, after the new origins are sent to the scheduler. Here you should mainly update the lister state by taking into account the new page processed, e.g. advance a date or serial field. Finally, upon either completion or error, the :py:meth:`finalize` is called. There you must set attribute :py:attr:`updated` to True if you were successful in advancing in the listing process. To do this you will commonly retrieve the latest saved lister state from the scheduler and compare with your current lister state. If lister state was updated, ultimately the current lister state will be recorded in the scheduler. We have now seen the stateful lister API. Note that some listers may implement more flexibility in the use of lister state. Some allow an ``incremental`` parameter that governs whether or not we will do a stateful listing. It is up to you to support additional functionality if it seems relevant. .. _handling-specific-topics: Handling specific topics ------------------------ Here is a quick coverage of common topics left out from lister construction and :py:meth:`get_pages` descriptions. Sessions ^^^^^^^^ When requesting a web service repeatedly, most parameters including headers do not change and could be set up once initially. We recommend setting up a session (e.g. an HTTP session) as an instance attribute so that further requesting code can focus on what really changes.
Some ubiquitous HTTP headers include "Accept" to set to the service response format and "User-Agent" for which we provide a recommended value :py:const:`USER_AGENT` to be imported from :py:mod:`swh.lister`. Authentication is also commonly provided through headers, so you can also set it up in the session. Transport error handling ^^^^^^^^^^^^^^^^^^^^^^^^ We generally recommend logging every unhandleable error with the response content and then immediately stop the listing by doing an equivalent of :py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting errors, we have a strategy of using a flexible decorator to handle the retrying for us. -It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from +It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from :py:mod:`swh.lister.utils`. Pagination ^^^^^^^^^^ This one is a moving target. You have to understand how the pagination mechanics of the particular service works. Some guidelines though. The identifier may be minimal (an id to pass as query parameter), compound (a set of such parameters) or complete (a whole URL). If the service provides the next URL, use it. The piece of information may be found either in the response body, or in a header. Once identified, you still have to implement the logic of requesting and extracting it in a loop and quitting the loop when there is no more data to fetch. Page results ^^^^^^^^^^^^ First, when retrieving page results, which involves some protocols and parsing logic, please make sure that any deviance from what was expected will result in an informational error. You also have to simplify the results, both with filtering request parameters if the service supports it, and by extracting from the response only the information needed into a structured page. This all makes for easier debugging. 
Misc files ^^^^^^^^^^ There are also a few files that need to be modified outside of the lister directory, namely: * :file:`/setup.py` to add your lister to the end of the list in the *setup* section:: entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register ...""" * :file:`/swh/lister/tests/test_cli.py` to get a default set of parameters in scheduler-related tests. * :file:`/README.md` to reference the new lister. * :file:`/CONTRIBUTORS` to add your name. Testing your lister ------------------- When developing a new lister, it's important to test. For this, add the tests (check :file:`swh/lister/*/tests/`) and register the celery tasks in the main conftest.py (:file:`swh/lister/core/tests/conftest.py`). Another important step is to actually run it within the docker-dev (:ref:`run-lister-tutorial`). More about listers ------------------ See current implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_ ). .. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py .. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py .. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py .. 
_GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index af3a3d8..a933650 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -1,501 +1,501 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging from pathlib import Path import re import tarfile import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import unquote, urljoin from bs4 import BeautifulSoup import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. ArchListerPage = List[Dict[str, Any]] def size_to_bytes(size: str) -> int: """Convert human readable file size to bytes. Resulting value is an approximation as input value is in most case rounded. 
Args: size: A string representing a human readable file size (eg: '500K') Returns: A decimal representation of file size Examples:: >>> size_to_bytes("500") 500 >>> size_to_bytes("1K") 1000 """ units = { "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4, "P": 1000**5, "E": 1000**6, "Z": 1000**7, "Y": 1000**8, } if size.endswith(tuple(units)): v, u = (size[:-1], size[-1]) return int(v) * units[u] else: return int(size) class ArchLister(StatelessLister[ArchListerPage]): """List Arch linux origins from 'core', 'extra', and 'community' repositories For 'official' Arch Linux it downloads core.tar.gz, extra.tar.gz and community.tar.gz from https://archive.archlinux.org/repos/last/ extract to a temp directory and then walks through each 'desc' files. Each 'desc' file describe the latest released version of a package and helps to build an origin url from where scrapping artifacts metadata. For 'arm' Arch Linux it follow the same discovery process parsing 'desc' files. The main difference is that we can't get existing versions of an arm package because https://archlinuxarm.org does not have an 'archive' website or api. 
""" LISTER_NAME = "arch" VISIT_TYPE = "arch" INSTANCE = "arch" ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( "{base_url}/packages/{pkgname[0]}/{pkgname}/{filename}" ) ARCH_API_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}/json" ARM_PACKAGE_URL_PATTERN = "{base_url}/packages/{arch}/{pkgname}" ARM_PACKAGE_DOWNLOAD_URL_PATTERN = "{base_url}/{arch}/{repo}/{filename}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, flavours: Dict[str, Any] = { "official": { "archs": ["x86_64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinux.org", "base_archive_url": "https://archive.archlinux.org", "base_mirror_url": "", "base_api_url": "https://archlinux.org", }, "arm": { "archs": ["armv7h", "aarch64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinuxarm.org", "base_archive_url": "", "base_mirror_url": "https://uk.mirror.archlinuxarm.org", "base_api_url": "", }, }, ): super().__init__( scheduler=scheduler, credentials=credentials, url=flavours["official"]["base_info_url"], instance=self.INSTANCE, ) self.flavours = flavours self.session = requests.Session() self.session.headers.update( { "User-Agent": USER_AGENT, } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def scrap_package_versions( self, name: str, repo: str, base_url: str ) -> List[Dict[str, 
Any]]: """Given a package 'name' and 'repo', make an http call to origin url and parse its content to get package versions artifacts data. That method is suitable only for 'official' Arch Linux, not 'arm'. Args: name: Package name repo: The repository the package belongs to (one of self.repos) Returns: A list of dict of version Example:: [ {"url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 "arch": "x86_64", "repo": "core", "name": "dialog", "version": "1:1.3_20190211-1", "length": 180000, "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", "last_modified": "2019-02-13T08:36:00"}, ] """ url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( pkgname=name, base_url=base_url ) response = self.request_get(url=url, params={}) soup = BeautifulSoup(response.text, "html.parser") links = soup.find_all("a", href=True) # drop the first line (used to go to up directory) if links[0].attrs["href"] == "../": links.pop(0) versions = [] for link in links: # filename displayed can be cropped if name is too long, get it from href instead filename = unquote(link.attrs["href"]) if filename.endswith((".tar.xz", ".tar.zst")): # Extract arch from filename arch_rex = re.compile( rf"^{re.escape(name)}-(?P.*)-(?Pany|i686|x86_64)" rf"(.pkg.tar.(?:zst|xz))$" ) m = arch_rex.match(filename) if m is None: logger.error( "Can not find a match for architecture in %(filename)s", dict(filename=filename), ) else: arch = m.group("arch") version = m.group("version") # Extract last_modified and an approximate file size raw_text = link.next_sibling raw_text_rex = re.compile( r"^(?P\d+-\w+-\d+ \d\d:\d\d)\s+(?P\w+)$" ) s = raw_text_rex.search(raw_text.strip()) if s is None: logger.error( "Can not find a match for 'last_modified' and/or " "'size' in '%(raw_text)s'", dict(raw_text=raw_text), ) else: assert s.groups() assert len(s.groups()) == 2 last_modified_str, size = s.groups() # format as expected last_modified = datetime.datetime.strptime( 
last_modified_str, "%d-%b-%Y %H:%M" ).isoformat() length = size_to_bytes(size) # we want bytes # link url is relative, format a canonical one url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format( base_url=base_url, pkgname=name, filename=filename ) versions.append( dict( name=name, version=version, repo=repo, arch=arch, filename=filename, url=url, last_modified=last_modified, length=length, ) ) return versions def get_repo_archive(self, url: str, destination_path: Path) -> Path: """Given an url and a destination path, retrieve and extract .tar.gz archive which contains 'desc' file for each package. Each .tar.gz archive corresponds to an Arch Linux repo ('core', 'extra', 'community'). Args: url: url of the .tar.gz archive to download destination_path: the path on disk where to extract archive Returns: a directory Path where the archive has been extracted to. """ res = self.request_get(url=url, params={}) destination_path.parent.mkdir(parents=True, exist_ok=True) destination_path.write_bytes(res.content) extract_to = Path(str(destination_path).split(".tar.gz")[0]) tar = tarfile.open(destination_path) tar.extractall(path=extract_to) tar.close() return extract_to def parse_desc_file( self, path: Path, repo: str, base_url: str, dl_url_fmt: str, ) -> Dict[str, Any]: """Extract package information from a 'desc' file. 
There are subtle differences between parsing 'official' and 'arm' des files Args: path: A path to a 'desc' file on disk repo: The repo the package belongs to Returns: A dict of metadata Example:: {'api_url': 'https://archlinux.org/packages/core/x86_64/dialog/json', 'arch': 'x86_64', 'base': 'dialog', 'builddate': '1650081535', 'csize': '203028', 'desc': 'A tool to display dialog boxes from shell scripts', 'filename': 'dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', 'isize': '483988', 'license': 'LGPL2.1', 'md5sum': '06407c0cb11c50d7bf83d600f2e8107c', 'name': 'dialog', 'packager': 'Evangelos Foutras ', 'pgpsig': 'pgpsig content xxx', 'project_url': 'https://invisible-island.net/dialog/', 'provides': 'libdialog.so=15-64', 'repo': 'core', 'sha256sum': 'ef8c8971f591de7db0f455970ef5d81d5aced1ddf139f963f16f6730b1851fa7', 'url': 'https://archive.archlinux.org/packages/.all/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', # noqa: B950 'version': '1:1.3_20220414-1'} """ rex = re.compile(r"^\%(?P\w+)\%\n(?P.*)\n$", re.M) with path.open("rb") as content: parsed = rex.findall(content.read().decode()) data = {entry[0].lower(): entry[1] for entry in parsed} if "url" in data.keys(): data["project_url"] = data["url"] assert data["name"] assert data["filename"] assert data["arch"] data["repo"] = repo data["url"] = urljoin( base_url, dl_url_fmt.format( base_url=base_url, pkgname=data["name"], filename=data["filename"], arch=data["arch"], repo=repo, ), ) assert data["md5sum"] assert data["sha256sum"] data["checksums"] = { "md5sum": hash_to_hex(data["md5sum"]), "sha256sum": hash_to_hex(data["sha256sum"]), } return data def get_pages(self) -> Iterator[ArchListerPage]: """Yield an iterator sorted by name in ascending order of pages. 
Each page is a list of package belonging to a flavour ('official', 'arm'), and a repo ('core', 'extra', 'community') """ for name, flavour in self.flavours.items(): for arch in flavour["archs"]: for repo in flavour["repos"]: yield self._get_repo_page(name, flavour, arch, repo) def _get_repo_page( self, name: str, flavour: Dict[str, Any], arch: str, repo: str ) -> ArchListerPage: with tempfile.TemporaryDirectory() as tmpdir: page = [] if name == "official": prefix = urljoin(flavour["base_archive_url"], "/repos/last/") filename = f"{repo}.files.tar.gz" archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") destination_path = Path(tmpdir, arch, filename) base_url = flavour["base_archive_url"] dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN base_info_url = flavour["base_info_url"] info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN elif name == "arm": filename = f"{repo}.files.tar.gz" archive_url = urljoin( flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" ) destination_path = Path(tmpdir, arch, filename) base_url = flavour["base_mirror_url"] dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN base_info_url = flavour["base_info_url"] info_url_fmt = self.ARM_PACKAGE_URL_PATTERN archive = self.get_repo_archive( url=archive_url, destination_path=destination_path ) assert archive packages_desc = list(archive.glob("**/desc")) logger.debug( "Processing %(instance)s source packages info from " "%(flavour)s %(arch)s %(repo)s repository, " "(%(qty)s packages).", dict( instance=self.instance, flavour=name, arch=arch, repo=repo, qty=len(packages_desc), ), ) for package_desc in packages_desc: data = self.parse_desc_file( path=package_desc, repo=repo, base_url=base_url, dl_url_fmt=dl_url_fmt, ) assert data["builddate"] last_modified = datetime.datetime.fromtimestamp( float(data["builddate"]), tz=datetime.timezone.utc ) assert data["name"] assert data["filename"] assert data["arch"] url = info_url_fmt.format( base_url=base_info_url, pkgname=data["name"], 
def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]:
    """Turn every package of an arch page into a ListedOrigin.

    For each package, its ``"versions"`` entries are projected twice: once
    onto the artifact fields (version, filename, url, length) and once onto
    the metadata fields (version, name, arch, repo, last_modified). Both
    projections are handed to the loader through ``extra_loader_arguments``.
    """
    assert self.lister_obj.id is not None

    artifact_fields = ("version", "filename", "url", "length")
    metadata_fields = ("version", "name", "arch", "repo", "last_modified")

    for package in page:
        # Single pass over the versions, building both projections side by
        # side so fields are read in the same order for every version.
        per_release = [
            (
                {field: release[field] for field in artifact_fields},
                {field: release[field] for field in metadata_fields},
            )
            for release in package["versions"]
        ]

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=package["url"],
            last_update=package["last_modified"],
            extra_loader_arguments={
                "artifacts": [artifact for artifact, _ in per_release],
                "arch_metadata": [metadata for _, metadata in per_release],
            },
        )
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime import logging import random from typing import Any, Dict, Iterator, List, Optional from urllib import parse import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class BitbucketListerState: """State of Bitbucket lister""" last_repo_cdate: Optional[datetime] = None """Creation date and time of the last listed repository during an incremental pass""" class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): """List origins from Bitbucket using its API. Bitbucket API has the following rate-limit configuration: * 60 requests per hour for anonymous users * 1000 requests per hour for authenticated users The lister is working in anonymous mode by default but Bitbucket account credentials can be provided to perform authenticated requests. 
def state_from_dict(self, d: Dict[str, Any]) -> BitbucketListerState:
    """Build a ``BitbucketListerState`` from its serialized dict form.

    Args:
        d: serialized state; its ``last_repo_cdate`` entry, when present and
            not ``None``, is parsed with ``iso8601.parse_date``.

    Returns:
        The state built from the entries of ``d``, with the parsed date
        substituted for the serialized one.

    The conversion is applied to a shallow copy: the caller's dict keeps its
    original serialized value, so it can be passed to this method again
    without feeding an already-parsed datetime to the parser.
    """
    state_kwargs = dict(d)
    last_repo_cdate = state_kwargs.get("last_repo_cdate")
    if last_repo_cdate is not None:
        state_kwargs["last_repo_cdate"] = iso8601.parse_date(last_repo_cdate)
    return BitbucketListerState(**state_kwargs)
def commit_page(self, page: List[Dict[str, Any]]) -> None:
    """Update the currently stored state using the latest listed page.

    Args:
        page: repositories of the page that was just listed; only the
            ``created_on`` value of its last entry is read.

    In incremental mode, ``self.state.last_repo_cdate`` is moved forward to
    the creation date of the last repository of the page when that date is
    more recent than the stored one (or when nothing is stored yet).

    This is a no-op when the lister is not incremental, and also when the
    page is empty since there is then no last repository to read a date from.
    """
    # Guard against an empty page: indexing its last element would raise
    # IndexError and abort the listing for a page that carries no data.
    if not self.incremental or not page:
        return

    last_repo_cdate = iso8601.parse_date(page[-1]["created_on"])
    if (
        self.state.last_repo_cdate is None
        or last_repo_cdate > self.state.last_repo_cdate
    ):
        self.state.last_repo_cdate = last_repo_cdate
id of # the current run is higher than that stored in the database. if ( scheduler_state.last_repo_cdate is None or self.state.last_repo_cdate > scheduler_state.last_repo_cdate ): self.updated = True diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index f516b2b..64921e2 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -1,91 +1,91 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
def get_pages(self) -> Iterator[BowerListerPage]:
    """Yield the single page listing the Bower packages.

    One request is made to ``self.url`` (with empty query parameters) through
    ``page_request``; the decoded JSON body of that response is yielded as
    the one and only page.
    """
    yield self.page_request(url=self.url, params={}).json()
def get_pages(self) -> Iterator[Repositories]:
    """Generate git 'project' URLs found on the current CGit server.

    Starting from ``self.url``, each index page is fetched and parsed, and one
    list of repository dicts is yielded per index page. Each dict has the keys:

    - ``url``: absolute URL of the repository page, set only when no
      ``base_git_url`` was configured (``None`` otherwise)
    - ``git_url``: repository URL with ``self.url`` replaced by
      ``base_git_url``, set only when ``base_git_url`` was configured
      (``None`` otherwise)
    - ``last_updated_date``: ``title`` attribute of the row's "age-" span, or
      ``None`` when the row has no such span

    The last update date is read from the index page rather than from the
    repository details, which only give a date per branch.

    Pagination follows the link located right after the "current" entry of
    the pager; listing stops as soon as no such link can be identified.
    """
    age_class = re.compile("age-")
    next_page: Optional[str] = self.url
    while next_page:
        bs_idx = self._get_and_parse(next_page)
        page_results = []

        for tr in bs_idx.find("div", {"class": "content"}).find_all(
            "tr", {"class": ""}
        ):
            repository_link = tr.find("a")["href"]
            repo_url = None
            git_url = None

            base_url = urljoin(self.url, repository_link).strip("/")
            if self.base_git_url:  # mapping provided
                # computing git url
                git_url = base_url.replace(self.url, self.base_git_url)
            else:
                # we compute the git detailed page url from which we will retrieve
                # the git url (cf. self.get_origins_from_page)
                repo_url = base_url

            span = tr.find("span", {"class": age_class})
            last_updated_date = span.get("title") if span else None

            page_results.append(
                {
                    "url": repo_url,
                    "git_url": git_url,
                    "last_updated_date": last_updated_date,
                }
            )

        yield page_results

        # Default to "no next page": the loop may only continue when a
        # following page link is positively identified. Leaving the previous
        # value in place (e.g. pager present but without a "current" entry)
        # would fetch the same index page forever.
        next_page = None
        try:
            pager = bs_idx.find("ul", {"class": "pager"})
            current_page = pager.find("a", {"class": "current"})
            if current_page:
                next_href = current_page.parent.next_sibling.a["href"]
                next_page = urljoin(self.url, next_href)
        except (AttributeError, KeyError, TypeError):
            # no pager, or no next page (TypeError: the element following
            # the current entry holds no link to subscript)
            next_page = None
repository_url, ) return self._get_origin_from_repository_url(summary_url) else: logger.debug("No summary tab found on %s", repository_url) # origin urls are listed on the repository page # TODO check if forcing https is better or not ? # # # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: logger.debug("No git urls found on %s", repository_url) return None # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] return origin_url def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: """Parse the last updated date""" date = repository.get("last_updated_date") if not date: return None parsed_date = None for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"): try: parsed_date = datetime.strptime(date, date_format) # force UTC to avoid naive datetime if not parsed_date.tzinfo: parsed_date = parsed_date.replace(tzinfo=timezone.utc) break except Exception: pass if not parsed_date: logger.warning( "Could not parse %s last_updated date: %s", repository["url"], date, ) return parsed_date diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py index 8e3242b..0cf59ad 100644 --- a/swh/lister/gitea/tests/test_lister.py +++ b/swh/lister/gitea/tests/test_lister.py @@ -1,176 +1,176 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import Dict, List, Tuple import pytest import requests from requests import HTTPError from swh.lister.gitea.lister import GiteaLister from swh.lister.gogs.lister import GogsListerPage from swh.scheduler.model import ListedOrigin TRYGITEA_URL = 
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
    """Assert that both collections hold the same set of origin URLs.

    Only the ``url`` attribute of the scheduler origins is compared;
    duplicates and ordering are ignored and ``last_update`` is not checked.
    """
    expected_urls = set(lister_urls)
    recorded_urls = {origin.url for origin in scheduler_origins}
    assert expected_urls == recorded_urls
credentials=creds) lister = GiteaLister(scheduler=swh_scheduler, **kwargs2) # test API token from credentials assert "Authorization" in lister.session.headers assert lister.session.headers["Authorization"].lower() == "token %s" % api_token # test instance inference from URL assert lister.instance assert "gitea" in lister.instance # infer something related to that # setup requests mocking p1_text, p1_headers, _, _ = trygitea_p1 p1_headers["Link"] = p1_headers["Link"].replace("next", "") # only 1 page base_url = TRYGITEA_URL + lister.REPO_LIST_PATH requests_mock.get(base_url, text=p1_text, headers=p1_headers) # now check the lister runs without error stats = lister.run() assert stats.pages == 1 -@pytest.mark.parametrize("http_code", [400, 500, 502]) +@pytest.mark.parametrize("http_code", [400, 500]) def test_gitea_list_http_error( swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2 ): """Test handling of some HTTP errors commonly encountered""" lister = GiteaLister(scheduler=swh_scheduler, url=TRYGITEA_URL, page_size=3) p1_text, p1_headers, _, p1_origin_urls = trygitea_p1 p3_text, p3_headers, _, p3_origin_urls = trygitea_p2 base_url = TRYGITEA_URL + lister.REPO_LIST_PATH requests_mock.get( base_url, [ {"text": p1_text, "headers": p1_headers, "status_code": 200}, {"status_code": http_code}, {"text": p3_text, "headers": p3_headers, "status_code": 200}, ], ) # pages with fatal repositories should be skipped (no error raised) # See T4423 for more details if http_code == 500: lister.run() else: with pytest.raises(HTTPError): lister.run() # Both P1 and P3 origins should be listed in case of 500 error # While in other cases, only P1 origins should be listed scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( (p1_origin_urls + p3_origin_urls) if http_code == 500 else p1_origin_urls, scheduler_origins, ) diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 61006b0..c878788 100644 --- 
a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,265 +1,265 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, Optional, Tuple from urllib.parse import parse_qs, urlencode, urlparse import iso8601 import requests from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import is_retryable_exception, throttling_retry +from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) # Some instance provides hg_git type which can be ingested as hg origins VCS_MAPPING = {"hg_git": "hg"} @dataclass class GitLabListerState: """State of the GitLabLister""" last_seen_next_link: Optional[str] = None """Last link header (not visited yet) during an incremental pass """ Repository = Dict[str, Any] @dataclass class PageResult: """Result from a query to a gitlab project api page.""" repositories: Optional[Tuple[Repository, ...]] = None next_page: Optional[str] = None def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate for handling HTTP responses with status code 403 with specific ratelimit header. 
""" attempt = retry_state.outcome if attempt.failed: exc = attempt.exception() return ( isinstance(exc, HTTPError) and exc.response.status_code == codes.forbidden and int(exc.response.headers.get("RateLimit-Remaining", "0")) == 0 ) or is_retryable_exception(exc) return False def _parse_id_after(url: Optional[str]) -> Optional[int]: """Given an url, extract a return the 'id_after' query parameter associated value or None. This is the the repository id used for pagination purposes. """ if not url: return None # link: https://${project-api}/?...&id_after=2x... query_data = parse_qs(urlparse(url).query) page = query_data.get("id_after") if page and len(page) > 0: return int(page[0]) return None class GitLabLister(Lister[GitLabListerState, PageResult]): """List origins for a gitlab instance. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_next_link` stored in the scheduler backend. Args: scheduler: a scheduler instance url: the api v4 url of the gitlab instance to visit (e.g. https://gitlab.com/api/v4/) instance: a specific instance name (e.g. 
gitlab, tor, git-kernel, ...), url network location will be used if not provided incremental: defines if incremental listing is activated or not """ def __init__( self, scheduler, url: str, name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, incremental: bool = False, ): if name is not None: self.LISTER_NAME = name super().__init__( scheduler=scheduler, url=url.rstrip("/"), instance=instance, credentials=credentials, ) self.incremental = incremental self.last_page: Optional[str] = None self.per_page = 100 self.session = requests.Session() self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) if len(self.credentials) > 0: cred = random.choice(self.credentials) logger.info( "Using %s credentials from user %s", self.instance, cred["username"] ) api_token = cred["password"] if api_token: self.session.headers["Authorization"] = f"Bearer {api_token}" def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: return GitLabListerState(**d) def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: logger.debug("Fetching URL %s", url) response = self.session.get(url) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) # GitLab API can return errors 500 when listing projects. # https://gitlab.com/gitlab-org/gitlab/-/issues/262629 # To avoid ending the listing prematurely, skip buggy URLs and move # to next pages. 
def page_url(self, id_after: Optional[int] = None) -> str:
    """Build the URL of a projects listing page.

    The URL targets ``<self.url>/projects`` with keyset pagination, ascending
    ordering by id, the "simple" representation and ``self.per_page`` results
    per page. When ``id_after`` is given (including ``0``), it is appended as
    the last query parameter.
    """
    query = [
        ("pagination", "keyset"),
        ("order_by", "id"),
        ("sort", "asc"),
        ("simple", "true"),
        ("per_page", f"{self.per_page}"),
    ]
    if id_after is not None:
        query.append(("id_after", str(id_after)))
    return "{}/projects?{}".format(self.url, urlencode(query))
def finalize(self) -> None:
    """Flag the lister state as updated when the listing moved forward.

    ``self.updated`` is set to ``True`` only in incremental mode, when the
    in-memory next link carries an ``id_after`` value and the state held by
    the scheduler either carries none or carries a strictly lower one.

    Note: this is a noop for full listing mode
    """
    link = self.state.last_seen_next_link
    if not (self.incremental and link):
        return

    # link: https://${project-api}/?...&page=2x...
    current_id = _parse_id_after(link)
    stored_state = self.get_state_from_scheduler()
    stored_id = _parse_id_after(stored_state.last_seen_next_link)

    if not current_id:
        return
    if not stored_id or stored_id < current_id:
        self.updated = True
swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) Repo = Dict[str, Any] @dataclass class GogsListerPage: repos: Optional[List[Repo]] = None next_link: Optional[str] = None @dataclass class GogsListerState: last_seen_next_link: Optional[str] = None """Last link header (could be already visited) during an incremental pass.""" last_seen_repo_id: Optional[int] = None """Last repo id seen during an incremental pass.""" def _parse_page_id(url: Optional[str]) -> int: """Parse the page id from a Gogs page url.""" if url is None: return 0 return int(parse_qs(urlparse(url).query)["page"][0]) class GogsLister(Lister[GogsListerState, GogsListerPage]): """List origins from the Gogs Gogs API documentation: https://github.com/gogs/docs-api The API is protected behind authentication so credentials/API tokens are mandatory. It supports pagination and provides next page URL through the 'next' value of the 'Link' header. The default value for page size ('limit') is 10 but the maximum allowed value is 50. 
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(
    self, url: str, params: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Fetch one page of the repository listing.

    Args:
        url: URL to request.
        params: extra query parameters sent along with ``url``.

    Returns:
        A ``(body, links)`` pair: the decoded JSON body and the parsed
        ``Link`` header of the response.

        On an HTTP 500 the page is skipped instead of failing: an empty
        ``data`` list is returned together with a synthesized ``next`` link
        pointing at the following page.

    Other non-200 responses are logged and then handed to
    ``response.raise_for_status()``.
    """
    logger.debug("Fetching URL %s with params %s", url, params)

    response = self.session.get(url, params=params)

    if response.status_code != 200:
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            response.status_code,
            response.url,
            response.content,
        )
        if (
            response.status_code == 500
        ):  # Temporary hack for skipping fatal repos (T4423)
            url_parts = urlparse(url)
            # The first request of a listing carries `page` and `limit` in
            # `params` rather than in the URL itself, so both sources must be
            # merged; follow-up requests are issued with empty params and rely
            # on the link being self-contained.
            query: Dict[str, Any] = {**dict(parse_qsl(url_parts.query)), **params}
            # Pages are numbered from 1, so a request without an explicit
            # page is treated as page 1.
            query["page"] = int(query.get("page", 1)) + 1
            next_page_link = url_parts._replace(query=urlencode(query)).geturl()
            body: Dict[str, Any] = {"data": []}
            links = {"next": {"url": next_page_link}}
            return body, links
        else:
            response.raise_for_status()

    return response.json(), response.links
self.get_state_from_scheduler() state_page_id = _parse_page_id(self.state.last_seen_next_link) scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link) state_last_repo_id = self.state.last_seen_repo_id or 0 scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0 if (state_page_id >= scheduler_page_id) and ( state_last_repo_id > scheduler_last_repo_id ): self.updated = True # Marked updated only if it finds new repos diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py index bcac533..dfcd991 100644 --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -1,330 +1,330 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import List from unittest.mock import Mock import pytest from requests import HTTPError from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id from swh.scheduler.model import ListedOrigin TRY_GOGS_URL = "https://try.gogs.io/api/v1/" def try_gogs_page(n: int): return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3" P1 = try_gogs_page(1) P2 = try_gogs_page(2) P3 = try_gogs_page(3) P4 = try_gogs_page(4) @pytest.fixture def trygogs_p1(datadir): text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text() headers = {"Link": f'<{P2}>; rel="next"'} page_result = GogsListerPage( repos=GogsLister.extract_repos(json.loads(text)), next_link=P2 ) origin_urls = [r["clone_url"] for r in page_result.repos] return text, headers, page_result, origin_urls @pytest.fixture def trygogs_p2(datadir): text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text() headers = {"Link": f'<{P3}>; rel="next",<{P1}>; rel="prev"'} page_result = GogsListerPage( 
@pytest.fixture
def trygogs_p3(datadir):
    """Third page of results, in the variant that links to a fourth page."""
    text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
    headers = {"Link": f'<{P4}>; rel="next",<{P2}>; rel="prev"'}
    # The expected page must carry the same next link as the Link header.
    page_result = GogsListerPage(
        repos=GogsLister.extract_repos(json.loads(text)), next_link=P4
    )
    origin_urls = [r["clone_url"] for r in page_result.repos]
    return text, headers, page_result, origin_urls


@pytest.fixture
def trygogs_p4(datadir):
    """Fourth page of results; it is the last one (no ``next`` link)."""
    text = Path(datadir, "https_try.gogs.io", "repos_page4").read_text()
    headers = {"Link": f'<{P3}>; rel="prev"'}
    # No rel="next" in the header, hence no next link on the expected page.
    page_result = GogsListerPage(
        repos=GogsLister.extract_repos(json.loads(text)), next_link=None
    )
    origin_urls = [r["clone_url"] for r in page_result.repos]
    return text, headers, page_result, origin_urls


@pytest.fixture
def trygogs_p3_last(datadir):
    """Third page of results, in the variant where it is the last page."""
    text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
    headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'}
    page_result = GogsListerPage(
        repos=GogsLister.extract_repos(json.loads(text)), next_link=None
    )
    origin_urls = [r["clone_url"] for r in page_result.repos]
    return text, headers, page_result, origin_urls


@pytest.fixture
def trygogs_p3_empty():
    """Third page of results, in the variant where it is last and empty."""
    origins_urls = []
    body = {"data": [], "ok": True}
    headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'}
    page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None)
    text = json.dumps(body)
    return text, headers, page_result, origins_urls
Does not test last_update.""" assert set(lister_urls) == {origin.url for origin in scheduler_origins} def test_gogs_full_listing( swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3_last ): kwargs = dict( url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" ) lister = GogsLister(scheduler=swh_scheduler, **kwargs) lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last requests_mock.get(P1, text=p1_text, headers=p1_headers) requests_mock.get(P2, text=p2_text, headers=p2_headers) requests_mock.get(P3, text=p3_text, headers=p3_headers) stats = lister.run() assert stats.pages == 3 assert stats.origins == 9 calls = map(mocker.call, [p1_result, p2_result, p3_result]) lister.get_origins_from_page.assert_has_calls(list(calls)) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins ) assert ( lister.get_state_from_scheduler().last_seen_next_link == P3 ) # P3 didn't provide any next link so it remains the last_seen_next_link def test_gogs_auth_instance( swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_p3_empty ): """Covers token authentication, token from credentials, instance inference from URL.""" api_token = "secret" instance = "try_gogs" # Test lister initialization without api_token or credentials: with pytest.raises(ValueError, match="No credentials or API token provided"): kwargs1 = dict(url=TRY_GOGS_URL, instance=instance) GogsLister(scheduler=swh_scheduler, **kwargs1) # Test lister initialization using api_token: kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) lister = GogsLister(scheduler=swh_scheduler, **kwargs2) assert lister.session.headers["Authorization"].lower() 
== "token %s" % api_token # Test lister initialization with credentials and run it: creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3) lister = GogsLister(scheduler=swh_scheduler, **kwargs3) assert lister.session.headers["Authorization"].lower() == "token %s" % api_token assert lister.instance == "try_gogs" # setup requests mocking p1_text, p1_headers, _, _ = trygogs_p1 p2_text, p2_headers, _, _ = trygogs_p2 p3_text, p3_headers, _, _ = trygogs_p3_empty requests_mock.get(P1, text=p1_text, headers=p1_headers) requests_mock.get(P2, text=p2_text, headers=p2_headers) requests_mock.get(P3, text=p3_text, headers=p3_headers) # lister should run without any error and extract the origins stats = lister.run() assert stats.pages == 3 assert stats.origins == 6 -@pytest.mark.parametrize("http_code", [400, 500, 502]) +@pytest.mark.parametrize("http_code", [400, 500]) def test_gogs_list_http_error( swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last ): """Test handling of some HTTP errors commonly encountered""" lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret") p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 p3_text, p3_headers, _, p3_origin_urls = trygogs_p3_last base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH requests_mock.get( base_url, [ {"text": p1_text, "headers": p1_headers, "status_code": 200}, {"status_code": http_code}, {"text": p3_text, "headers": p3_headers, "status_code": 200}, ], ) # pages with fatal repositories should be skipped (no error raised) # See T4423 for more details if http_code == 500: lister.run() else: with pytest.raises(HTTPError): lister.run() # Both P1 and P3 origins should be listed in case of 500 error # While in other cases, only P1 origins should be listed scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( (p1_origin_urls + 
p3_origin_urls) if http_code == 500 else p1_origin_urls, scheduler_origins, ) def test_gogs_incremental_lister( swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3, trygogs_p3_last, trygogs_p3_empty, trygogs_p4, ): kwargs = dict( url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" ) lister = GogsLister(scheduler=swh_scheduler, **kwargs) lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") # First listing attempt: P1 and P2 return 3 origins each # while P3 (current last page) is empty. p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_empty requests_mock.get(P1, text=p1_text, headers=p1_headers) requests_mock.get(P2, text=p2_text, headers=p2_headers) requests_mock.get(P3, text=p3_text, headers=p3_headers) attempt1_stats = lister.run() assert attempt1_stats.pages == 3 assert attempt1_stats.origins == 6 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results lister_state = lister.get_state_from_scheduler() assert lister_state.last_seen_next_link == P3 assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"] assert lister.updated check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) lister.updated = False # Reset the flag # Second listing attempt: P3 isn't empty anymore. # The lister should restart from last state and hence revisit P3. 
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last requests_mock.get(P3, text=p3_text, headers=p3_headers) lister.session.get = mocker.spy(lister.session, "get") attempt2_stats = lister.run() assert attempt2_stats.pages == 1 assert attempt2_stats.origins == 3 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results page_id = _parse_page_id(lister_state.last_seen_next_link) query_params = lister.query_params query_params["page"] = page_id lister.session.get.assert_called_once_with( TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params ) # All the 9 origins (3 pages) should be passed on to the scheduler: check_listed_origins( p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins ) lister_state = lister.get_state_from_scheduler() assert lister_state.last_seen_next_link == P3 assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] assert lister.updated lister.updated = False # Reset the flag # Third listing attempt: No new origins # The lister should revisit last seen page (P3) attempt3_stats = lister.run() assert attempt3_stats.pages == 1 assert attempt3_stats.origins == 3 lister_state = lister.get_state_from_scheduler() assert lister_state.last_seen_next_link == P3 assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] assert lister.updated is False # No new origins so state isn't updated. # Fourth listing attempt: Page 4 is introduced and returns 3 new origins # The lister should revisit last seen page (P3) as well as P4. 
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 # new P3 points to P4 p4_text, p4_headers, p4_result, p4_origin_urls = trygogs_p4 requests_mock.get(P3, text=p3_text, headers=p3_headers) requests_mock.get(P4, text=p4_text, headers=p4_headers) attempt4_stats = lister.run() assert attempt4_stats.pages == 2 assert attempt4_stats.origins == 6 lister_state = lister.get_state_from_scheduler() assert lister_state.last_seen_next_link == P4 assert lister_state.last_seen_repo_id == p4_result.repos[-1]["id"] assert lister.updated # All the 12 origins (4 pages) should be passed on to the scheduler: scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( p1_origin_urls + p2_origin_urls + p3_origin_urls + p4_origin_urls, scheduler_origins, ) diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 0a2f141..87ea9f8 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -1,188 +1,187 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import json import logging from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 import requests from tenacity import before_sleep_log -from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class GolangStateType: last_seen: Optional[datetime] = None """Last timestamp of a package version we have saved. 
@http_retry(
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def api_request(self, url: str) -> List[str]:
    """Query ``url`` and return the response text split on whitespace.

    Responses whose status is neither 200 nor 304 are logged, and
    ``raise_for_status()`` is called on the response before its text is used.
    """
    logger.debug("Fetching URL %s", url)

    reply = self.session.get(url)
    expected_status = reply.status_code in (200, 304)

    if not expected_status:
        # Log response details to ease debugging
        logger.warning(
            "Unexpected HTTP status code %s for URL %s",
            reply.status_code,
            reply.url,
        )

    reply.raise_for_status()

    entries = reply.text.split()
    return entries
def get_pages(self) -> Iterator[GolangPageType]:
    """Yield pages of index entries until the index is exhausted.

    In incremental mode the listing starts from ``self.state.last_seen``,
    otherwise from the beginning of the index. ``self.state.last_seen`` is
    moved to the timestamp of the last entry of each page before that page
    is yielded.
    """
    since = self.state.last_seen if self.incremental else None

    while True:
        page, since = self.get_single_page(since=since)
        if not page:
            # An empty page means there is nothing left to list.
            return
        if since == self.state.last_seen:
            # The index returns packages whose timestamp are greater or
            # equal to the date provided as parameter, which will create
            # an infinite loop if not stopped here.
            return
        if since is not None:
            self.state.last_seen = since
        yield page
""" assert self.lister_obj.id is not None for module in page: path = module["Path"] # The loader will be expected to use the golang proxy to do the # actual downloading. We're using `pkg.go.dev` so that the URL points # to somewhere useful for a human instead of an (incomplete) API path. origin_url = f"https://pkg.go.dev/{path}" # Since the Go index lists versions and not just packages, there will # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, # so only the last timestamp will be used, with no duplicates. # Performance should not be an issue as they are sent to the db in bulk. yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="golang", last_update=module["Timestamp"], ) diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index b134303..e9c36fa 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -1,209 +1,209 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import logging from typing import Any, Dict, Iterator, Optional, Tuple import iso8601 from launchpadlib.launchpad import Launchpad from lazr.restfulclient.errors import RestfulError from lazr.restfulclient.resource import Collection from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import retry_if_exception, throttling_retry +from swh.lister.utils import http_retry, retry_if_exception from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) VcsType = str LaunchpadPageType = Tuple[VcsType, Collection] SUPPORTED_VCS_TYPES = ("git", "bzr") @dataclass class LaunchpadListerState: """State of 
def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState:
    """Build a ``LaunchpadListerState`` from its dict form.

    For each supported vcs type, a non-``None`` ``<vcs>_date_last_modified``
    entry is parsed with ``iso8601.parse_date``. All entries of ``d`` are
    passed as keyword arguments to ``LaunchpadListerState``.
    """
    # Work on a shallow copy so the caller's dict keeps its original values.
    parsed: Dict[str, Any] = dict(d)
    for vcs_type in SUPPORTED_VCS_TYPES:
        key = f"{vcs_type}_date_last_modified"
        date_last_modified = parsed.get(key)
        if date_last_modified is not None:
            parsed[key] = iso8601.parse_date(date_last_modified)
    return LaunchpadListerState(**parsed)
@http_retry(
    retry=retry_if_restful_error,
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def _page_request(
    self, launchpad, vcs_type: str, date_last_modified: Optional[datetime]
) -> Optional[Collection]:
    """Request the repositories of ``vcs_type`` modified since a given date.

    The call is wrapped by the retrying decorator above, which is configured
    with ``retry_if_restful_error``.
    """
    # Both accessors are resolved up front, whichever one ends up being used.
    list_git_repositories = launchpad.git_repositories.getRepositories
    list_bzr_branches = launchpad.branches.getBranches
    fetchers = {"git": list_git_repositories, "bzr": list_bzr_branches}

    fetch = fetchers[vcs_type]
    return fetch(
        order_by="most neglected first",
        modified_since_date=date_last_modified,
    )
def finalize(self) -> None:
    """Record the latest modification dates seen and flag the state as updated.

    Nothing happens when no modification date was collected for git nor bzr.
    In incremental mode, each per-vcs date of the state is replaced when it
    is unset or older than the one collected during this run.
    """
    latest_git = self.date_last_modified["git"]
    latest_bzr = self.date_last_modified["bzr"]
    if latest_git is None and latest_bzr is None:
        return

    if self.incremental:
        known_git = self.state.git_date_last_modified
        if known_git is None or (latest_git is not None and latest_git > known_git):
            self.state.git_date_last_modified = latest_git

    if self.incremental:
        known_bzr = self.state.bzr_date_last_modified
        if known_bzr is None or (latest_bzr is not None and latest_bzr > known_bzr):
            self.state.bzr_date_last_modified = self.date_last_modified["bzr"]

    self.updated = True
@dataclass
class MavenListerState:
    """Checkpoint kept by the Maven lister between incremental passes."""

    # Id of the last index document ingested during an incremental pass.
    last_seen_doc: int = -1

    # Id of the last index document related to a pom and ingested during an
    # incremental pass.
    last_seen_pom: int = -1
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
    """GET ``url`` with the lister session and return the response.

    The call is wrapped by the ``http_retry`` decorator, configured to log
    at WARNING level before each sleep.

    Args:
        url: address to fetch
        params: query parameters forwarded to the request

    Returns:
        The response, once ``raise_for_status`` let it through.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status``.
    """
    logger.info("Fetching URL %s with params %s", url, params)

    resp = self.session.get(url, params=params)
    unexpected_status = resp.status_code != 200
    if unexpected_status:
        # Keep the body in the logs to ease debugging of failed requests.
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            resp.status_code,
            resp.url,
            resp.content,
        )
    resp.raise_for_status()
    return resp
logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) if response.status_code != 200: logger.error("Index %s not found, stopping", self.INDEX_URL) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P[^|]+)\|(?P[^|]+)\|(?P[^|]+)\|" + r"(?P[^|]+)\|(?P[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. 
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
    """Retrieve the scm origin out of the page information.

    Only called when the type of the page is ``scm``. Tries to detect an
    scm/vcs repository. The official format is
    ``scm:{type}:git://example.org/{user}/{repo}.git``, but some projects put
    the repository URL directly after ``scm:`` (without the type), so the
    content is checked to extract the type and URL properly.

    Args:
        page: page of type ``scm``; its ``url`` entry holds the raw scm
            connection string.

    Returns:
        A ListedOrigin with the proper canonical scm URL (for github) if any
        is found, None otherwise.

    Raises:
        AssertionError: when the type of the page is not ``scm``.
    """
    assert page["type"] == "scm"

    raw_url = page["url"]
    # The group names are required: they are read back below through
    # .group("type") and .group("url").
    m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", raw_url)
    if m_scm is None:
        return None

    visit_type: Optional[str] = None
    url: Optional[str] = None

    scm_type = m_scm.group("type")
    if scm_type in SUPPORTED_SCM_TYPES:
        url = m_scm.group("url")
        visit_type = scm_type
    elif raw_url.endswith(".git"):
        # No recognised type: assume git and drop the leading "scm:" marker(s).
        # str.lstrip("scm:") must not be used here, it strips any leading run
        # of the characters 's', 'c', 'm' and ':' and would turn
        # "scm:ssh://host/repo.git" into "h://host/repo.git".
        url = raw_url
        while url.startswith("scm:"):
            url = url[len("scm:") :]
        visit_type = "git"
    else:
        return None

    if url and visit_type == "git":
        # Non-github urls will be returned as is, github ones will be canonical ones
        url = self.github_session.get_canonical_url(url)

    if not url:
        return None

    assert visit_type is not None
    assert self.lister_obj.id is not None
    return ListedOrigin(
        lister_id=self.lister_obj.id,
        url=url,
        visit_type=visit_type,
    )
def finalize(self) -> None:
    """Finalize the lister state, set update if any progress has been made.

    The counters held by the lister are compared with the state stored in
    the scheduler; the lister is flagged as updated as soon as one of them
    moved forward.

    Note: this is a noop for full listing mode

    """
    if not (self.incremental and self.state):
        return

    last_seen_doc = self.state.last_seen_doc
    last_seen_pom = self.state.last_seen_pom

    scheduler_state = self.get_state_from_scheduler()

    # Counters are compared explicitly against None instead of being tested
    # for truthiness: a doc id of 0 is legitimate progress (and falsy),
    # whereas the initial -1 value is truthy.
    if last_seen_doc is None or last_seen_pom is None:
        return

    if (
        scheduler_state.last_seen_doc < last_seen_doc
        or scheduler_state.last_seen_pom < last_seen_pom
    ):
        self.updated = True
@pytest.fixture
def requests_mock(requests_mock):
    """Make github API calls for the configured scm repositories answer with
    their canonical url."""
    canonical_url_by_api_url = {
        GIT_REPO_URL0_API: GIT_REPO_URL0_HTTPS,
        GIT_REPO_URL1_API: GIT_REPO_URL1_HTTPS,
        GIT_REPO_URL2_API: GIT_REPO_URL2_HTTPS,
    }
    for api_url, canonical_url in canonical_url_by_api_url.items():
        requests_mock.get(api_url, json={"html_url": canonical_url})

    yield requests_mock
def test_maven_full_listing(swh_scheduler):
    """Covers full listing of multiple pages, checking page results and listed
    origins, statelessness."""

    full_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )
    run_stats = full_lister.run()

    assert run_stats.pages == 5

    listed = swh_scheduler.get_listed_origins(full_lister.lister_obj.id).results
    listed_urls = [listed_origin.url for listed_origin in listed]

    # 2 git origins + 1 maven origin with 2 releases (one per jar)
    assert len(listed_urls) == 3
    assert sorted(listed_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)

    maven_origins = [
        listed_origin for listed_origin in listed if listed_origin.visit_type == "maven"
    ]
    for maven_origin in maven_origins:
        # The origin must be at least as recent as each of its artifacts.
        for expected_artifact in LIST_SRC_DATA:
            artifact_date = iso8601.parse_date(expected_artifact["time"])
            assert artifact_date <= maven_origin.last_update
        assert maven_origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)

    # A full listing leaves the stored state at its initial values.
    stored_state = full_lister.get_state_from_scheduler()
    assert stored_state is not None
    assert stored_state.last_seen_doc == -1
    assert stored_state.last_seen_pom == -1
assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. 
assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert len(origin_urls) == 2 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. 
@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code):
    """should stop listing if the lister fails to retrieve the main index url."""

    failing_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        index_url=INDEX_URL,
    )
    requests_mock.get(INDEX_URL, status_code=http_code)

    # Without the index the listing cannot continue, so the error propagates.
    with pytest.raises(requests.HTTPError):
        failing_lister.run()

    listed = swh_scheduler.get_listed_origins(failing_lister.lister_obj.id).results
    assert len(listed) == 0
def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime):
    """Listing the ``maven_index_null_mtime`` index must produce a single page
    and a single origin whose last update is unset."""

    requests_mock.get(INDEX_URL, content=maven_index_null_mtime)

    null_mtime_lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )
    run_stats = null_mtime_lister.run()

    assert run_stats.pages == 1

    listed = swh_scheduler.get_listed_origins(null_mtime_lister.lister_obj.id).results
    assert len(listed) == 1

    only_origin = listed[0]
    assert only_origin.last_update is None
@dataclass
class NpmListerState:
    """Checkpoint kept by the npm lister between incremental listings."""

    # ``seq`` value of the last package entry committed by an incremental
    # listing; stays None as long as no such entry has been committed.
    last_seq: Optional[int] = None
def request_params(self, last_package_id: str) -> Dict[str, Any]:
    """Build the query parameters of a listing request.

    Args:
        last_package_id: position to resume from; sent as ``since`` in
            incremental mode and as ``startkey`` otherwise.

    Returns:
        The query parameters, which always ask for the package JSON
        documents as they hold the last update dates.
    """
    query: Dict[str, Any] = {"limit": self.page_size, "include_docs": "true"}
    cursor_name = "since" if self.incremental else "startkey"
    query[cursor_name] = last_package_id
    return query
def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
    """Yield the pages of package entries returned by the listing endpoint.

    Incremental listings resume from the ``last_seq`` kept in the state when
    there is one, and from ``"0"`` otherwise; full listings start from an
    empty start key. Requests stop on an empty page or on a page shorter
    than the page size.
    """
    if self.incremental:
        cursor = "0"
        if self.state is not None and self.state.last_seq is not None:
            cursor = str(self.state.last_seq)
    else:
        cursor = '""'

    while True:
        payload = self.page_request(cursor).json()
        entries = payload["results"] if self.incremental else payload["rows"]
        if not entries:
            return

        short_page = len(entries) < self.page_size
        if self.incremental or short_page:
            yield entries
        else:
            # Full listing, complete page: the trailing entry is used as the
            # start key of the next request, hence served again as its first
            # row, so it is held back here.
            yield entries[:-1]

        if short_page:
            return

        tail = entries[-1]
        cursor = str(tail["seq"]) if self.incremental else f'"{tail["id"]}"'
def finalize(self):
    """Flag the lister as updated when an incremental listing moved forward.

    Nothing happens outside incremental mode or while the lister state holds
    no ``last_seq``. Otherwise the lister is marked as updated when the state
    stored in the scheduler has no ``last_seq`` or an older one.
    """
    if not self.incremental or self.state.last_seq is None:
        return

    stored_seq = self.get_state_from_scheduler().last_seq
    if stored_seq is None or self.state.last_seq > stored_seq:
        self.updated = True
o in scheduler_origins if o.url == origin_url] assert scheduler_origin assert scheduler_origin[0].last_update == package_last_update def _match_request(request): return request.headers.get("User-Agent") == USER_AGENT def _url_params(page_size, **kwargs): params = {"limit": page_size, "include_docs": "true"} params.update(**kwargs) return params def test_npm_lister_full( swh_scheduler, requests_mock, mocker, npm_full_listing_page1, npm_full_listing_page2 ): """Simulate a full listing of four npm packages in two pages""" page_size = 2 lister = NpmLister(scheduler=swh_scheduler, page_size=page_size, incremental=False) requests_mock.get( lister.API_FULL_LISTING_URL, [ {"json": npm_full_listing_page1}, {"json": npm_full_listing_page2}, ], additional_matcher=_match_request, ) spy_get = mocker.spy(lister.session, "get") stats = lister.run() assert stats.pages == 2 assert stats.origins == page_size * stats.pages spy_get.assert_has_calls( [ mocker.call( lister.API_FULL_LISTING_URL, params=_url_params(page_size + 1, startkey='""'), ), mocker.call( lister.API_FULL_LISTING_URL, params=_url_params( page_size + 1, startkey=f'"{npm_full_listing_page1["rows"][-1]["id"]}"', ), ), ] ) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results _check_listed_npm_packages( lister, chain(npm_full_listing_page1["rows"][:-1], npm_full_listing_page2["rows"]), scheduler_origins, ) assert lister.get_state_from_scheduler() == NpmListerState() def test_npm_lister_incremental( swh_scheduler, requests_mock, mocker, npm_incremental_listing_page1, npm_incremental_listing_page2, ): """Simulate an incremental listing of four npm packages in two pages""" page_size = 2 lister = NpmLister(scheduler=swh_scheduler, page_size=page_size, incremental=True) requests_mock.get( lister.API_INCREMENTAL_LISTING_URL, [ {"json": npm_incremental_listing_page1}, {"json": npm_incremental_listing_page2}, {"json": {"results": []}}, ], additional_matcher=_match_request, ) spy_get = 
mocker.spy(lister.session, "get") assert lister.get_state_from_scheduler() == NpmListerState() stats = lister.run() assert stats.pages == 2 assert stats.origins == page_size * stats.pages last_seq = npm_incremental_listing_page2["results"][-1]["seq"] spy_get.assert_has_calls( [ mocker.call( lister.API_INCREMENTAL_LISTING_URL, params=_url_params(page_size, since="0"), ), mocker.call( lister.API_INCREMENTAL_LISTING_URL, params=_url_params( page_size, since=str(npm_incremental_listing_page1["results"][-1]["seq"]), ), ), mocker.call( lister.API_INCREMENTAL_LISTING_URL, params=_url_params(page_size, since=str(last_seq)), ), ] ) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results _check_listed_npm_packages( lister, chain( npm_incremental_listing_page1["results"], npm_incremental_listing_page2["results"], ), scheduler_origins, ) assert lister.get_state_from_scheduler() == NpmListerState(last_seq=last_seq) def test_npm_lister_incremental_restart( swh_scheduler, requests_mock, mocker, ): """Check incremental npm listing will restart from saved state""" page_size = 2 last_seq = 67 lister = NpmLister(scheduler=swh_scheduler, page_size=page_size, incremental=True) lister.state = NpmListerState(last_seq=last_seq) requests_mock.get(lister.API_INCREMENTAL_LISTING_URL, json={"results": []}) spy_get = mocker.spy(lister.session, "get") lister.run() spy_get.assert_called_with( lister.API_INCREMENTAL_LISTING_URL, params=_url_params(page_size, since=str(last_seq)), ) def test_npm_lister_http_error( swh_scheduler, requests_mock, mocker, ): lister = NpmLister(scheduler=swh_scheduler) requests_mock.get(lister.API_FULL_LISTING_URL, status_code=500) with pytest.raises(HTTPError): lister.run() diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index a17ad0e..77cb4de 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,125 +1,125 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the 
top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 import requests from requests.exceptions import HTTPError from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import __version__ from ..pattern import CredentialsType, StatelessLister # https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers USER_AGENT = ( f"Software Heritage PubDev Lister v{__version__} " "(+https://www.softwareheritage.org/contact)" ) logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. PubDevListerPage = List[str] class PubDevLister(StatelessLister[PubDevListerPage]): """List pub.dev (Dart, Flutter) origins.""" LISTER_NAME = "pubdev" VISIT_TYPE = "pubdev" INSTANCE = "pubdev" BASE_URL = "https://pub.dev/" PACKAGE_NAMES_URL_PATTERN = "{base_url}api/package-names" PACKAGE_INFO_URL_PATTERN = "{base_url}api/packages/{pkgname}" ORIGIN_URL_PATTERN = "{base_url}packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, 
params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package origins. The http api call get "{base_url}package-names" to retrieve a sorted list of all package names. There is only one page that list all origins url based on "{base_url}packages/{pkgname}" """ response = self.page_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={} ) yield response.json()["packages"] def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for pkgname in page: package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) try: response = self.page_request(url=package_info_url, params={}) except HTTPError: logger.warning( "Failed to fetch metadata for package %s, skipping it from listing.", pkgname, ) continue package_metadata = response.json() package_versions = package_metadata["versions"] last_published = max( package_version["published"] for package_version in package_versions ) origin_url = self.ORIGIN_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin_url, last_update=iso8601.parse_date(last_published), ) diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index eefd797..443c21d 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,177 +1,177 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See 
top-level LICENSE file for more information from collections import defaultdict from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging from time import sleep from typing import Any, Dict, Iterator, List, Optional, Tuple from xmlrpc.client import Fault, ServerProxy from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Type returned by the XML-RPC changelog call: # package, version, release timestamp, description, serial ChangelogEntry = Tuple[str, str, int, str, int] # Manipulated package updated type which is a subset information # of the ChangelogEntry type: package, max release date PackageUpdate = Tuple[str, datetime] # Type returned by listing a page of results PackageListPage = List[PackageUpdate] @dataclass class PyPIListerState: """State of PyPI lister""" last_serial: Optional[int] = None """Last seen serial when visiting the pypi instance""" def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate to handle xmlrpc client error: .. 
code:: xmlrpc.client.Fault: """ attempt = retry_state.outcome return attempt.failed and isinstance(attempt.exception(), Fault) def pypi_url(package_name: str) -> str: """Build pypi url out of a package name.""" return PyPILister.PACKAGE_URL.format(package_name=package_name) class PyPILister(Lister[PyPIListerState, PackageListPage]): """List origins from PyPI.""" LISTER_NAME = "pypi" INSTANCE = "pypi" # As of today only the main pypi.org is used PACKAGE_LIST_URL = "https://pypi.org/pypi" # XML-RPC url PACKAGE_URL = "https://pypi.org/project/{package_name}/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, url=self.PACKAGE_LIST_URL, instance=self.INSTANCE, credentials=credentials, ) # used as termination condition and if useful, becomes the new state when the # visit is done self.last_processed_serial: Optional[int] = None def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState: return PyPIListerState(last_serial=d.get("last_serial")) def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_last_serial(self, client: ServerProxy) -> int: """Internal detail to allow throttling when calling the changelog last entry""" serial = client.changelog_last_serial() assert isinstance(serial, int) return serial - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_since_serial( self, client: ServerProxy, serial: int ) -> List[ChangelogEntry]: """Internal detail to allow throttling when calling the changelog listing""" sleep(1) # to avoid the initial warning about throttling return client.changelog_since_serial(serial) # type: ignore def get_pages(self) -> Iterator[PackageListPage]: """Iterate other changelog events per package, determine the max 
release date for that package and use that max release date as last_update. When the execution is done, this will also set the self.last_processed_serial attribute so we can finalize the state of the lister for the next visit. Yields: List of Tuple of (package-name, max release-date) """ client = ServerProxy(self.url) last_processed_serial = -1 if self.state.last_serial is not None: last_processed_serial = self.state.last_serial upstream_last_serial = self._changelog_last_serial(client) # Paginate through result of pypi, until we read everything while last_processed_serial < upstream_last_serial: updated_packages = defaultdict(list) for package, _, release_date, _, serial in self._changelog_since_serial( client, last_processed_serial ): updated_packages[package].append(release_date) # Compute the max serial so we can stop when done last_processed_serial = max(last_processed_serial, serial) # Returns pages of result to flush regularly yield [ ( pypi_url(package), datetime.fromtimestamp(max(release_dates)).replace( tzinfo=timezone.utc ), ) for package, release_dates in updated_packages.items() ] self.last_processed_serial = upstream_last_serial def get_origins_from_page( self, packages: PackageListPage ) -> Iterator[ListedOrigin]: """Convert a page of PyPI repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for origin, last_update in packages: yield ListedOrigin( lister_id=self.lister_obj.id, url=origin, visit_type="pypi", last_update=last_update, ) def finalize(self): """Finalize the visit state by updating with the new last_serial if updates actually happened. 
""" self.updated = ( self.state and self.state.last_serial and self.last_processed_serial and self.state.last_serial < self.last_processed_serial ) or (not self.state.last_serial and self.last_processed_serial) if self.updated: self.state.last_serial = self.last_processed_serial diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index dcc30c3..8bc56ee 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,457 +1,456 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass, field import datetime from enum import Enum import logging import re from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from xml.etree import ElementTree from bs4 import BeautifulSoup import iso8601 import lxml import requests from tenacity.before_sleep import before_sleep_log from swh.core.api.classes import stream_results -from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. 
import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) class VcsNames(Enum): """Used to filter SourceForge tool names for valid VCS types""" # CVS projects are read-only CVS = "cvs" GIT = "git" SUBVERSION = "svn" MERCURIAL = "hg" BAZAAR = "bzr" VCS_NAMES = set(v.value for v in VcsNames.__members__.values()) @dataclass class SourceForgeListerEntry: vcs: VcsNames url: str last_modified: datetime.date SubSitemapNameT = str ProjectNameT = str # SourceForge only offers day-level granularity, which is good enough for our purposes LastModifiedT = datetime.date @dataclass class SourceForgeListerState: """Current state of the SourceForge lister in incremental runs""" """If the subsitemap does not exist, we assume a full run of this subsitemap is needed. If the date is the same, we skip the subsitemap, otherwise we request the subsitemap and look up every project's "last modified" date to compare against `ListedOrigins` from the database.""" subsitemap_last_modified: Dict[SubSitemapNameT, LastModifiedT] = field( default_factory=dict ) """Some projects (not the majority, but still meaningful) have no VCS for us to archive. We need to remember a mapping of their API URL to their "last modified" date so we don't keep querying them needlessly every time.""" empty_projects: Dict[str, LastModifiedT] = field(default_factory=dict) SourceForgeListerPage = List[SourceForgeListerEntry] MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" # API resource endpoint for information about the given project. # # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe` # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" # Predictable URL for cloning (in the broad sense) a VCS registered for the project. 
# # Warning: does not apply to bzr repos, and Mercurial are http only, see use of this # constant below. # # `vcs`: VCS type, one of `VCS_NAMES` # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe`. # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses # `git` (https://git.code.sf.net/p/codeblocks/git). CLONE_URL_FORMAT = "https://{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" PROJ_URL_RE = re.compile( r"^https://sourceforge.net/(?P[^/]+)/(?P[^/]+)/(?P.*)?" ) # Mapping of `(namespace, project name)` to `last modified` date. ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge.""" # Part of the lister API, that identifies this lister LISTER_NAME = "sourceforge" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, ) # Will hold the currently saved "last modified" dates to compare against our # requests. 
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None self.session = requests.Session() # Declare the USER_AGENT is more sysadm-friendly for the forge we list self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) self.incremental = incremental def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: subsitemaps = { k: datetime.date.fromisoformat(v) for k, v in d.get("subsitemap_last_modified", {}).items() } empty_projects = { k: datetime.date.fromisoformat(v) for k, v in d.get("empty_projects", {}).items() } return SourceForgeListerState( subsitemap_last_modified=subsitemaps, empty_projects=empty_projects ) def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]: return { "subsitemap_last_modified": { k: v.isoformat() for k, v in state.subsitemap_last_modified.items() }, "empty_projects": { k: v.isoformat() for k, v in state.empty_projects.items() }, } def projects_last_modified(self) -> ProjectsLastModifiedCache: if not self.incremental: # No point in loading the previous results if we're doing a full run return {} if self._project_last_modified is not None: return self._project_last_modified # We know there will be at least that many origins stream = stream_results( self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000 ) listed_origins = dict() # Projects can have slashes in them if they're subprojects, but the # mointpoint (last component) cannot. 
url_match = re.compile( r".*\.code\.sf\.net/(?P[^/]+)/(?P.+)/.*" ) bzr_url_match = re.compile( r"http://(?P[^/]+).bzr.sourceforge.net/bzr/([^/]+)" ) cvs_url_match = re.compile( r"rsync://a.cvs.sourceforge.net/cvsroot/(?P.+)/([^/]+)" ) for origin in stream: url = origin.url match = url_match.match(url) if match is None: # Could be a bzr or cvs special endpoint bzr_match = bzr_url_match.match(url) cvs_match = cvs_url_match.match(url) matches = None if bzr_match is not None: matches = bzr_match.groupdict() elif cvs_match is not None: matches = cvs_match.groupdict() assert matches project = matches["project"] namespace = "p" # no special namespacing for bzr and cvs projects else: matches = match.groupdict() namespace = matches["namespace"] project = matches["project"] # "Last modified" dates are the same across all VCS (tools, even) # within a project or subproject. An assertion here would be overkill. last_modified = origin.last_update assert last_modified is not None listed_origins[(namespace, project)] = last_modified.date() self._project_last_modified = listed_origins return listed_origins - @throttling_retry( - retry=retry_policy_generic, + @http_retry( before_sleep=before_sleep_log(logger, logging.WARNING), ) def page_request(self, url, params) -> requests.Response: # Log listed URL to ease debugging logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: # Log response content to ease debugging logger.warning( "Unexpected HTTP status code %s for URL %s", response.status_code, response.url, ) # The lister must fail on blocking errors response.raise_for_status() return response def get_pages(self) -> Iterator[SourceForgeListerPage]: """ SourceForge has a main XML sitemap that lists its sharded sitemaps for all projects. Each XML sub-sitemap lists project pages, which are not unique per project: a project can have a wiki, a home, a git, an svn, etc. 
For each unique project, we query an API endpoint that lists (among other things) the tools associated with said project, some of which are the VCS used. Subprojects are considered separate projects. Lastly we use the information of which VCS are used to build the predictable clone URL for any given VCS. """ sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text tree = ElementTree.fromstring(sitemap_contents) for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_el is not None and last_modified_el.text is not None last_modified = datetime.date.fromisoformat(last_modified_el.text) location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None and location.text is not None sub_url = location.text if self.incremental: recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) if recorded_last_mod == last_modified: # The entire subsitemap hasn't changed, so none of its projects # have either, skip it. 
continue self.state.subsitemap_last_modified[sub_url] = last_modified subsitemap_contents = self.page_request(sub_url, {}).text subtree = ElementTree.fromstring(subsitemap_contents) yield from self._get_pages_from_subsitemap(subtree) def get_origins_from_page( self, page: SourceForgeListerPage ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: last_modified: str = str(hit.last_modified) last_update: datetime.datetime = iso8601.parse_date(last_modified) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, last_update=last_update, ) def _get_pages_from_subsitemap( self, subtree: ElementTree.Element ) -> Iterator[SourceForgeListerPage]: projects: Set[ProjectNameT] = set() for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_block is not None last_modified = last_modified_block.text location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None project_url = location.text assert project_url is not None match = PROJ_URL_RE.match(project_url) if match: matches = match.groupdict() namespace = matches["namespace"] if namespace == "projects": # These have a `p`-namespaced counterpart, use that instead continue project = matches["project"] rest = matches["rest"] if rest.count("/") > 1: # This is a subproject. There exists no sub-subprojects. subproject_name = rest.rsplit("/", 2)[0] project = f"{project}/{subproject_name}" prev_len = len(projects) projects.add(project) if prev_len == len(projects): # Already seen continue pages = self._get_pages_for_project(namespace, project, last_modified) if pages: yield pages else: logger.debug("Project '%s' does not have any VCS", project) else: # Should almost always match, let's log it # The only ones that don't match are mostly specialized one-off URLs. 
msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) def _get_pages_for_project( self, namespace, project, last_modified ) -> SourceForgeListerPage: endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project) empty_project_last_modified = self.state.empty_projects.get(endpoint) if empty_project_last_modified is not None: if last_modified == empty_project_last_modified.isoformat(): # Project has not changed, so is still empty, meaning it has # no VCS attached that we can archive. logger.debug(f"Project {namespace}/{project} is still empty") return [] if self.incremental: expected = self.projects_last_modified().get((namespace, project)) if expected is not None: if expected.isoformat() == last_modified: # Project has not changed logger.debug(f"Project {namespace}/{project} has not changed") return [] else: logger.debug(f"Project {namespace}/{project} was updated") else: msg = "New project during an incremental run: %s/%s" logger.debug(msg, namespace, project) try: res = self.page_request(endpoint, {}).json() except requests.HTTPError: # We've already logged in `page_request` return [] tools = res.get("tools") if tools is None: # This rarely happens, on very old URLs logger.warning("Project '%s' does not have any tools", endpoint) return [] hits = [] for tool in tools: tool_name = tool["name"] if tool_name not in VCS_NAMES: continue if tool_name == VcsNames.CVS.value: # CVS projects are different from other VCS ones, they use the rsync # protocol, a list of modules needs to be fetched from an info page # and multiple origin URLs can be produced for a same project. 
cvs_info_url = f"http://{project}.cvs.sourceforge.net" try: response = self.page_request(cvs_info_url, params={}) except requests.HTTPError: logger.warning( "CVS info page could not be fetched, skipping project '%s'", project, ) continue else: bs = BeautifulSoup(response.text, features="html.parser") cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot" for text in [b.text for b in bs.find_all("b")]: match = re.search(rf".*/cvsroot/{project} co -P (.+)", text) if match is not None: module = match.group(1) if module != "Attic": url = f"{cvs_base_url}/{project}/{module}" hits.append( SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified, ) ) continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, project=project, mount_point=tool["mount_point"], ) if tool_name == VcsNames.MERCURIAL.value: # SourceForge does not yet support anonymous HTTPS cloning for Mercurial # See https://sourceforge.net/p/forge/feature-requests/727/ url = url.replace("https://", "http://") if tool_name == VcsNames.BAZAAR.value: # SourceForge has removed support for bzr and only keeps legacy projects # around at a separate (also not https) URL. Bzr projects are very rare # and a lot of them are 404 now. 
url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" try: response = self.page_request(url, params={}) if "To get this branch, use:" not in response.text: # If a bzr project has multiple branches, we need to extract their # names from the repository landing page and create one listed origin # per branch parser = lxml.etree.HTMLParser() tree = lxml.etree.fromstring(response.text, parser) # Get all tds with class 'autcell' tds = tree.xpath(".//td[contains(@class, 'autcell')]") for td in tds: branch = td.findtext("a") # If the td's parent contains Branch and # it has non-empty text: if td.xpath("..//img[@alt='Branch']") and branch: hits.append( SourceForgeListerEntry( vcs=VcsNames(tool_name), url=f"{url}/{branch}", last_modified=last_modified, ) ) continue except requests.HTTPError: logger.warning( "Bazaar repository page could not be fetched, skipping project '%s'", project, ) continue entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) hits.append(entry) if not hits: date = datetime.date.fromisoformat(last_modified) self.state.empty_projects[endpoint] = date else: self.state.empty_projects.pop(endpoint, None) return hits diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py index 6d9b50d..98b376f 100644 --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,126 +1,130 @@ -# Copyright (C) 2018-2021 the Software Heritage developers +# Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import requests from requests.status_codes import codes from tenacity.wait import wait_fixed -from swh.lister.utils import ( - MAX_NUMBER_ATTEMPTS, - WAIT_EXP_BASE, - split_range, - throttling_retry, -) +from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range @pytest.mark.parametrize( 
"total_pages,nb_pages,expected_ranges", [ (14, 5, [(0, 4), (5, 9), (10, 14)]), (19, 10, [(0, 9), (10, 19)]), (20, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)]), ( 21, 3, [ (0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21), ], ), ], ) def test_split_range(total_pages, nb_pages, expected_ranges): actual_ranges = list(split_range(total_pages, nb_pages)) assert actual_ranges == expected_ranges @pytest.mark.parametrize("total_pages,nb_pages", [(None, 1), (100, None)]) def test_split_range_errors(total_pages, nb_pages): for total_pages, nb_pages in [(None, 1), (100, None)]: with pytest.raises(TypeError): next(split_range(total_pages, nb_pages)) TEST_URL = "https://example.og/api/repositories" -@throttling_retry() +@http_retry() def make_request(): response = requests.get(TEST_URL) response.raise_for_status() return response def assert_sleep_calls(mocker, mock_sleep, sleep_params): mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) -def test_throttling_retry(requests_mock, mocker): +@pytest.mark.parametrize( + "status_code", + [ + codes.too_many_requests, + codes.internal_server_error, + codes.bad_gateway, + codes.service_unavailable, + ], +) +def test_http_retry(requests_mock, mocker, status_code): data = {"result": {}} requests_mock.get( TEST_URL, [ - {"status_code": codes.too_many_requests}, - {"status_code": codes.too_many_requests}, + {"status_code": status_code}, + {"status_code": status_code}, {"status_code": codes.ok, "json": data}, ], ) mock_sleep = mocker.patch.object(make_request.retry, "sleep") response = make_request() assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE]) assert response.json() == data -def test_throttling_retry_max_attemps(requests_mock, mocker): +def test_http_retry_max_attemps(requests_mock, mocker): requests_mock.get( TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS), ) mock_sleep = mocker.patch.object(make_request.retry, "sleep") with 
pytest.raises(requests.exceptions.HTTPError) as e: make_request() assert e.value.response.status_code == codes.too_many_requests assert_sleep_calls( mocker, mock_sleep, [float(WAIT_EXP_BASE**i) for i in range(MAX_NUMBER_ATTEMPTS - 1)], ) -@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE)) +@http_retry(wait=wait_fixed(WAIT_EXP_BASE)) def make_request_wait_fixed(): response = requests.get(TEST_URL) response.raise_for_status() return response -def test_throttling_retry_wait_fixed(requests_mock, mocker): +def test_http_retry_wait_fixed(requests_mock, mocker): requests_mock.get( TEST_URL, [ {"status_code": codes.too_many_requests}, {"status_code": codes.too_many_requests}, {"status_code": codes.ok}, ], ) mock_sleep = mocker.patch.object(make_request_wait_fixed.retry, "sleep") make_request_wait_fixed() assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2) diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py index 179329a..7e0b800 100644 --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -1,150 +1,150 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] class TuleapLister(StatelessLister[RepoPage]): """List origins from Tuleap. Tuleap provides SVN and Git repositories hosting. 
Tuleap API getting started: https://tuleap.net/doc/en/user-guide/integration/rest.html Tuleap API reference: https://tuleap.net/api/explorer/ Using the API we first request a list of projects, and from there request their associated repositories individually. Everything is paginated, code uses throttling at the individual GET call level.""" LISTER_NAME = "tuleap" REPO_LIST_PATH = "/api" REPO_GIT_PATH = "plugins/git/" REPO_SVN_PATH = "plugins/svn/" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response @classmethod def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage: if repo_type == "git": prefix_url = TuleapLister.REPO_GIT_PATH else: prefix_url = TuleapLister.REPO_SVN_PATH rep = { "project": repo["name"], "type": repo_type, "uri": urljoin(url, f"{prefix_url}{repo['path']}"), "last_update_date": repo["last_update_date"], } return rep def _get_repositories(self, url_repo) -> List[Dict[str, Any]]: ret = self.page_request(url_repo, {}) reps_list = ret.json()["repositories"] limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"]) offset = int(ret.headers["X-PAGINATION-LIMIT"]) size = int(ret.headers["X-PAGINATION-SIZE"]) while offset < 
size: url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit) ret = self.page_request(url_offset, {}).json() reps_list = reps_list + ret["repositories"] offset += limit return reps_list def get_pages(self) -> Iterator[RepoPage]: # base with trailing slash, path without leading slash for urljoin url_api: str = urljoin(self.url, self.REPO_LIST_PATH) url_projects = url_api + "/projects/" # Get the list of projects. response = self.page_request(url_projects, {}) projects_list = response.json() limit = int(response.headers["X-PAGINATION-LIMIT-MAX"]) offset = int(response.headers["X-PAGINATION-LIMIT"]) size = int(response.headers["X-PAGINATION-SIZE"]) while offset < size: url_offset = ( url_projects + "?offset=" + str(offset) + "&limit=" + str(limit) ) ret = self.page_request(url_offset, {}).json() projects_list = projects_list + ret offset += limit # Get list of repositories for each project. for p in projects_list: p_id = p["id"] # Fetch Git repositories for project url_git = url_projects + str(p_id) + "/git" repos = self._get_repositories(url_git) for repo in repos: yield self.results_simplified(url_api, "git", repo) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Tuleap repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None yield ListedOrigin( lister_id=self.lister_obj.id, url=page["uri"], visit_type=page["type"], last_update=iso8601.parse_date(page["last_update_date"]), ) diff --git a/swh/lister/tuleap/tests/test_lister.py b/swh/lister/tuleap/tests/test_lister.py index 16d0c7a..d650dc8 100644 --- a/swh/lister/tuleap/tests/test_lister.py +++ b/swh/lister/tuleap/tests/test_lister.py @@ -1,165 +1,170 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import 
Path from typing import Dict, List, Tuple import pytest import requests from swh.lister.tuleap.lister import RepoPage, TuleapLister from swh.scheduler.model import ListedOrigin TULEAP_URL = "https://tuleap.net/" TULEAP_PROJECTS_URL = TULEAP_URL + "api/projects/" TULEAP_REPO_1_URL = TULEAP_URL + "api/projects/685/git" # manjaromemodoc TULEAP_REPO_2_URL = TULEAP_URL + "api/projects/309/git" # myaurora TULEAP_REPO_3_URL = TULEAP_URL + "api/projects/1080/git" # tuleap cleanup module GIT_REPOS = ( "https://tuleap.net/plugins/git/manjaromemodoc/manjaro-memo-documentation.git", "https://tuleap.net/plugins/git/myaurora/myaurora.git", ) @pytest.fixture def tuleap_projects(datadir) -> Tuple[str, Dict[str, str], List[str]]: text = Path(datadir, "https_tuleap.net", "projects").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "2", } repo_json = json.loads(text) projects = [p["shortname"] for p in repo_json] return text, headers, projects @pytest.fixture def tuleap_repo_1(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[str]]: text = Path(datadir, "https_tuleap.net", "repo_1").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "1", } reps = json.loads(text) page_results = [] for r in reps["repositories"]: page_results.append( TuleapLister.results_simplified(url=TULEAP_URL, repo_type="git", repo=r) ) origin_urls = [r["uri"] for r in page_results] return text, headers, page_results, origin_urls @pytest.fixture def tuleap_repo_2(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[str]]: text = Path(datadir, "https_tuleap.net", "repo_2").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "1", } reps = json.loads(text) page_results = [] for r in reps["repositories"]: page_results.append( TuleapLister.results_simplified(url=TULEAP_URL, repo_type="git", repo=r) ) origin_urls = [r["uri"] for r in 
page_results] return text, headers, page_results, origin_urls @pytest.fixture def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[str]]: text = Path(datadir, "https_tuleap.net", "repo_3").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "0", } reps = json.loads(text) page_results = [] for r in reps["repositories"]: page_results.append( TuleapLister.results_simplified(url=TULEAP_URL, repo_type="git", repo=r) ) origin_urls = [r["uri"] for r in page_results] return text, headers, page_results, origin_urls +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(TuleapLister.page_request.retry, "sleep") + + def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs. Does not test last_update.""" assert set(lister_urls) == {origin.url for origin in scheduler_origins} def test_tuleap_full_listing( swh_scheduler, requests_mock, mocker, tuleap_projects, tuleap_repo_1, tuleap_repo_2, tuleap_repo_3, ): """Covers full listing of multiple pages, rate-limit, page size (required for test), checking page results and listed origins, statelessness.""" lister = TuleapLister( scheduler=swh_scheduler, url=TULEAP_URL, instance="tuleap.net" ) p_text, p_headers, p_projects = tuleap_projects r1_text, r1_headers, r1_result, r1_origin_urls = tuleap_repo_1 r2_text, r2_headers, r2_result, r2_origin_urls = tuleap_repo_2 r3_text, r3_headers, r3_result, r3_origin_urls = tuleap_repo_3 requests_mock.get(TULEAP_PROJECTS_URL, text=p_text, headers=p_headers) requests_mock.get(TULEAP_REPO_1_URL, text=r1_text, headers=r1_headers) requests_mock.get( TULEAP_REPO_2_URL, [ {"status_code": requests.codes.too_many_requests}, {"text": r2_text, "headers": r2_headers}, ], ) requests_mock.get(TULEAP_REPO_3_URL, text=r3_text, headers=r3_headers) # end test setup stats = lister.run() # start test checks assert 
stats.pages == 2 assert stats.origins == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( r1_origin_urls + r2_origin_urls + r3_origin_urls, scheduler_origins ) check_listed_origins(GIT_REPOS, scheduler_origins) assert lister.get_state_from_scheduler() is None @pytest.mark.parametrize("http_code", [400, 500, 502]) def test_tuleap_list_http_error(swh_scheduler, requests_mock, http_code): """Test handling of some HTTP errors commonly encountered""" lister = TuleapLister(scheduler=swh_scheduler, url=TULEAP_URL) requests_mock.get(TULEAP_PROJECTS_URL, status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 diff --git a/swh/lister/utils.py b/swh/lister/utils.py index ea4a989..125b31b 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,121 +1,113 @@ -# Copyright (C) 2018-2021 the Software Heritage developers +# Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Callable, Iterator, Tuple from requests.exceptions import ConnectionError, HTTPError from requests.status_codes import codes from tenacity import retry as tenacity_retry from tenacity.stop import stop_after_attempt from tenacity.wait import wait_exponential def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: """Split `total_pages` into mostly `nb_pages` ranges. In some cases, the last range can have one more element. 
>>> list(split_range(19, 10)) [(0, 9), (10, 19)] >>> list(split_range(20, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)] >>> list(split_range(21, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21)] """ prev_index = None for index in range(0, total_pages, nb_pages): if index is not None and prev_index is not None: yield prev_index, index - 1 prev_index = index if index != total_pages: yield index, total_pages def is_throttling_exception(e: Exception) -> bool: """ Checks if an exception is a requests.exception.HTTPError for a response with status code 429 (too many requests). """ return ( isinstance(e, HTTPError) and e.response.status_code == codes.too_many_requests ) def is_retryable_exception(e: Exception) -> bool: """ Checks if an exception is worth retrying (connection, throttling or a server error). """ is_connection_error = isinstance(e, ConnectionError) is_500_error = isinstance(e, HTTPError) and e.response.status_code >= 500 return is_connection_error or is_throttling_exception(e) or is_500_error def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> bool: """ Custom tenacity retry predicate for handling exceptions with the given predicate. """ attempt = retry_state.outcome if attempt.failed: exception = attempt.exception() return predicate(exception) return False -def retry_if_throttling(retry_state) -> bool: - """ - Custom tenacity retry predicate for handling HTTP responses with - status code 429 (too many requests). - """ - return retry_if_exception(retry_state, is_throttling_exception) - - def retry_policy_generic(retry_state) -> bool: """ Custom tenacity retry predicate for handling failed requests: - ConnectionError - Server errors (status >= 500) - Throttling errors (status == 429) This does not handle 404, 403 or other status codes. 
""" return retry_if_exception(retry_state, is_retryable_exception) WAIT_EXP_BASE = 10 MAX_NUMBER_ATTEMPTS = 5 -def throttling_retry( - retry=retry_if_throttling, +def http_retry( + retry=retry_policy_generic, wait=wait_exponential(exp_base=WAIT_EXP_BASE), stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), **retry_args, ): """ Decorator based on `tenacity` for retrying a function possibly raising requests.exception.HTTPError for status code 429 (too many requests). It provides a default configuration that should work properly in most cases but all `tenacity.retry` parameters can also be overridden in client code. When the mmaximum of attempts is reached, the HTTPError exception will then be reraised. Args: retry: function defining request retry condition (default to 429 status code) https://tenacity.readthedocs.io/en/latest/#whether-to-retry wait: function defining wait strategy before retrying (default to exponential backoff) https://tenacity.readthedocs.io/en/latest/#waiting-before-retrying stop: function defining when to stop retrying (default after 5 attempts) https://tenacity.readthedocs.io/en/latest/#stopping """ return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args)