diff --git a/PKG-INFO b/PKG-INFO index 6687f51..4c7c008 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,127 +1,123 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.2 +Version: 2.9.3 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origin listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` with one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2.
create the configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration shared by all listers, to add to the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: this expects the scheduler service to be running locally on port 5008. ## Executing a lister Once configured, a lister can be executed using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. - - diff --git a/conftest.py b/conftest.py index da8b930..00eb31a 100644 --- a/conftest.py +++ b/conftest.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -pytest_plugins = ["swh.scheduler.pytest_plugin"] +pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"] os.environ["LC_ALL"] = "C.UTF-8" diff --git a/requirements-swh.txt b/requirements-swh.txt index b451cdb..3281b3e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db] >= 0.9 +swh.core[db,github] >= 2.8 swh.scheduler >= 0.8 diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 6687f51..4c7c008 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,127 +1,123 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.2 +Version: 2.9.3 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier:
Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origin listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` with one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create the configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration shared by all listers, to add to the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: this expects the scheduler service to be running locally on port 5008. ## Executing a lister Once configured, a lister can be executed using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program.
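Both copies of the README leave `credentials` empty in the sample configuration. For authenticated listing, which the GitHub rate-limit handling changed below depends on, credentials form a mapping keyed by lister name, then instance name. A minimal sketch of that structure in Python, matching the `CredentialsType` shape exercised by the test fixtures later in this diff; the usernames and tokens are placeholders:

```python
# Placeholder credentials; shape: {lister_name: {instance_name: [auth, ...]}}
credentials = {
    "github": {
        "github": [
            {"username": "some-user", "token": "some-token"},
            # legacy-style entries carry the token under "password" instead
            {"username": "legacy-user", "password": "legacy-token"},
        ]
    }
}
```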
- - diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 6caae43..5e69dc1 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,19 +1,19 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity>=6.2 xmltodict lxml -swh.core[db]>=0.9 +swh.core[db,github]>=2.8 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 2655744..acef224 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,208 +1,208 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import datetime import logging from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 +from swh.core.github.utils import GitHubSession, MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister -from .utils import GitHubSession, MissingRateLimitReset logger = logging.getLogger(__name__) @dataclass class GitHubListerState: """State of the GitHub lister""" last_seen_id: int = 0 """Numeric id of the last repository listed on an incremental pass""" class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """List origins from GitHub. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_id` stored in the scheduler backend. Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in that mode, the lister finds the origins present in the range **excluding** `first_id` and **including** `last_id`. In this mode, the lister can overrun the `last_id`: it will always record all the origins seen in a given page. As the lister is fully idempotent, this is not a practical problem. Once relisting completes, the lister state in the scheduler backend is not updated. When the config contains a set of credentials, we shuffle this list at the beginning of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the same token over and over again, until its rate limit runs out. Once that happens, we switch to the next token over in our shuffled list. When a request fails with a rate limit exception for all tokens, we pause the listing until the largest value for X-Ratelimit-Reset over all tokens. When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits Args: first_id: the id of the first repo to list last_id: stop listing after seeing a repo with an id higher than this value. 
""" # noqa: B950 LISTER_NAME = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, first_id: Optional[int] = None, last_id: Optional[int] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance="github", ) self.first_id = first_id self.last_id = last_id self.relisting = self.first_id is not None or self.last_id is not None self.github_session = GitHubSession( credentials=self.credentials, user_agent=USER_AGENT ) def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: current_id = 0 if self.first_id is not None: current_id = self.first_id elif self.state is not None: current_id = self.state.last_seen_id current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}" while self.last_id is None or current_id < self.last_id: logger.debug("Getting page %s", current_url) try: response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up break # We've successfully retrieved a (non-ratelimited) `response`. We # still need to check it for validity. if response.status_code != 200: logger.warning( "Got unexpected status_code %s: %s", response.status_code, response.content, ) break yield response.json() if "next" not in response.links: # No `next` link, we've reached the end of the world logger.debug( "No next link found in the response headers, all caught up" ) break # GitHub strongly advises to use the next link directly. We still # parse it to get the id of the last repository we've reached so # far. next_url = response.links["next"]["url"] parsed_url = urlparse(next_url) if not parsed_url.query: logger.warning("Failed to parse url %s", next_url) break parsed_query = parse_qs(parsed_url.query) current_id = int(parsed_query["since"][0]) current_url = next_url def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of GitHub repositories into a list of ListedOrigins. This records the html_url, as well as the pushed_at value if it exists. """ assert self.lister_obj.id is not None seen_in_page: Set[str] = set() for repo in page: if not repo: # null repositories in listings happen sometimes... continue if repo["html_url"] in seen_in_page: continue seen_in_page.add(repo["html_url"]) pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: pushed_at = iso8601.parse_date(pushed_at_str) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["html_url"], visit_type="git", last_update=pushed_at, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page""" if self.relisting: # Don't update internal state when relisting return if not page: # Sometimes, when you reach the end of the world, GitHub returns an empty # page of repositories return last_id = page[-1]["id"] if last_id > self.state.last_seen_id: self.state.last_seen_id = last_id def finalize(self): if self.relisting: return # Pull fresh lister state from the scheduler backend scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. 
if self.state.last_seen_id > scheduler_state.last_seen_id: self.updated = True diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py index 2c874ae..88c5bf4 100644 --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -1,418 +1,245 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging -import time -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List import pytest import requests_mock +from swh.core.github.pytest_plugin import github_response_callback from swh.lister.github.lister import GitHubLister from swh.lister.pattern import CredentialsType, ListerStats from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import Lister NUM_PAGES = 10 ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES -def github_repo(i: int) -> Dict[str, Union[int, str]]: - """Basic repository information returned by the GitHub API""" - - repo: Dict[str, Union[int, str]] = { - "id": i, - "html_url": f"https://github.com/origin/{i}", - } - - # Set the pushed_at date on one of the origins - if i == 4321: - repo["pushed_at"] = "2018-11-08T13:16:24Z" - - return repo - - -def github_response_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, -) -> List[Dict[str, Union[str, int]]]: - """Return minimal GitHub API responses for the common case where the loader - hasn't been rate-limited""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - - # Check request parameters: per_page == 1000, since = last_repo_id - assert "per_page" in request.qs - assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)] - assert "since" in request.qs - - since = int(request.qs["since"][0]) - - next_page = since + GitHubLister.PAGE_SIZE - if next_page < ORIGIN_COUNT: - # the first id for the next page is within our origin count; add a Link - # header to the response - next_url = ( - GitHubLister.API_URL - + f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}" - ) - context.headers["Link"] = f"<{next_url}>; rel=next" - - return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)] - - @pytest.fixture() def requests_mocker() -> Iterator[requests_mock.Mocker]: with requests_mock.Mocker() as mock: mock.get(GitHubLister.API_URL, json=github_response_callback) yield mock def get_lister_data(swh_scheduler: SchedulerInterface) -> Lister: """Retrieve the data for the GitHub Lister""" return swh_scheduler.get_or_create_lister(name="github", instance_name="github") def set_lister_state(swh_scheduler: SchedulerInterface, state: Dict[str, Any]) -> None: """Set the state of the lister in database""" lister = swh_scheduler.get_or_create_lister(name="github", instance_name="github") lister.current_state = state swh_scheduler.update_lister(lister) def check_origin_4321(swh_scheduler: SchedulerInterface, lister: Lister) -> None: """Check that origin 4321 exists and has the proper last_update timestamp""" origin_4321_req = swh_scheduler.get_listed_origins( url="https://github.com/origin/4321" ) assert 
len(origin_4321_req.results) == 1 origin_4321 = origin_4321_req.results[0] assert origin_4321.lister_id == lister.id assert origin_4321.visit_type == "git" assert origin_4321.last_update == datetime.datetime( 2018, 11, 8, 13, 16, 24, tzinfo=datetime.timezone.utc ) def check_origin_5555(swh_scheduler: SchedulerInterface, lister: Lister) -> None: """Check that origin 5555 exists and has no last_update timestamp""" origin_5555_req = swh_scheduler.get_listed_origins( url="https://github.com/origin/5555" ) assert len(origin_5555_req.results) == 1 origin_5555 = origin_5555_req.results[0] assert origin_5555.lister_id == lister.id assert origin_5555.visit_type == "git" assert origin_5555.last_update is None def test_from_empty_state( swh_scheduler, caplog, requests_mocker: requests_mock.Mocker ) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Run the lister in incremental mode lister = GitHubLister(scheduler=swh_scheduler) res = lister.run() assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) listed_origins = swh_scheduler.get_listed_origins(limit=ORIGIN_COUNT + 1) assert len(listed_origins.results) == ORIGIN_COUNT assert listed_origins.next_page_token is None lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT} check_origin_4321(swh_scheduler, lister_data) check_origin_5555(swh_scheduler, lister_data) def test_incremental(swh_scheduler, caplog, requests_mocker) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Number of origins to skip skip_origins = 2000 expected_origins = ORIGIN_COUNT - skip_origins # Bump the last_seen_id in the scheduler backend set_lister_state(swh_scheduler, {"last_seen_id": skip_origins}) # Run the lister in incremental mode lister = GitHubLister(scheduler=swh_scheduler) res = lister.run() # add 1 page to the number of full_pages if partial_page_len is not 0 full_pages, partial_page_len = divmod(expected_origins, GitHubLister.PAGE_SIZE) expected_pages = full_pages + bool(partial_page_len) assert res == ListerStats(pages=expected_pages, origins=expected_origins) listed_origins = swh_scheduler.get_listed_origins(limit=expected_origins + 1) assert len(listed_origins.results) == expected_origins assert listed_origins.next_page_token is None lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT} check_origin_4321(swh_scheduler, lister_data) check_origin_5555(swh_scheduler, lister_data) def test_relister(swh_scheduler, caplog, requests_mocker) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Only set this state as a canary: in the currently tested mode, the lister # should not be touching it. set_lister_state(swh_scheduler, {"last_seen_id": 123}) # Use "relisting" mode to list origins between id 10 and 1011 lister = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011) res = lister.run() # Make sure we got two full pages of results assert res == ListerStats(pages=2, origins=2000) # Check that the relisting mode hasn't touched the stored state. 
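The canary state above pins down the docstring's relisting contract: a bounded run covers ids in ]first_id, last_id] and never writes back to the scheduler state. A usage sketch under the same assumptions as these tests; `swh_scheduler` stands for an already-configured scheduler backend, so this is not self-contained:

```python
from swh.lister.github.lister import GitHubLister

# Relist origins with 10 < id <= 1011; the lister may overrun last_id to
# finish its current page, and leaves last_seen_id in the backend untouched.
lister = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011)  # noqa: F821
stats = lister.run()
```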
lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": 123} -def github_ratelimit_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, - ratelimit_reset: Optional[int], -) -> Dict[str, str]: - """Return a rate-limited GitHub API response.""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - if "Authorization" in request.headers: - context.status_code = 429 - else: - context.status_code = 403 - - if ratelimit_reset is not None: - context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset) - - return { - "message": "API rate limit exceeded for .", - "documentation_url": "https://developer.github.com/v3/#rate-limiting", - } - - -@pytest.fixture() -def num_before_ratelimit() -> int: - """Number of successful requests before the ratelimit hits""" - return 0 - - -@pytest.fixture() -def num_ratelimit() -> Optional[int]: - """Number of rate-limited requests; None means infinity""" - return None - - -@pytest.fixture() -def ratelimit_reset() -> Optional[int]: - """Value of the X-Ratelimit-Reset header on ratelimited responses""" - return None - - -@pytest.fixture() -def requests_ratelimited( - num_before_ratelimit: int, - num_ratelimit: Optional[int], - ratelimit_reset: Optional[int], -) -> Iterator[requests_mock.Mocker]: - """Mock requests to the GitHub API, returning a rate-limiting status code - after `num_before_ratelimit` requests. - - GitHub does inconsistent rate-limiting: - - Anonymous requests return a 403 status code - - Authenticated requests return a 429 status code, with an - X-Ratelimit-Reset header. - - This fixture takes multiple arguments (which can be overridden with a - :func:`pytest.mark.parametrize` parameter): - - num_before_ratelimit: the global number of requests until the - ratelimit triggers - - num_ratelimit: the number of requests that return a - rate-limited response. - - ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the - request is authenticated. - - The default values set in the previous fixtures make all requests return a rate - limit response. 
- """ - current_request = 0 - - def response_callback(request, context): - nonlocal current_request - current_request += 1 - if num_before_ratelimit < current_request and ( - num_ratelimit is None - or current_request < num_before_ratelimit + num_ratelimit + 1 - ): - return github_ratelimit_callback(request, context, ratelimit_reset) - else: - return github_response_callback(request, context) - - with requests_mock.Mocker() as mock: - mock.get(GitHubLister.API_URL, json=response_callback) - yield mock - - def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None: - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler) assert lister.github_session.anonymous assert "using anonymous mode" in caplog.records[-1].message caplog.clear() res = lister.run() assert res == ListerStats(pages=0, origins=0) last_log = caplog.records[-1] assert last_log.levelname == "WARNING" assert "No X-Ratelimit-Reset value found in responses" in last_log.message -@pytest.fixture -def github_credentials() -> List[Dict[str, str]]: - """Return a static list of GitHub credentials""" - return sorted( - [{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)] - + [ - {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"} - for i in range(3) - ], - key=lambda c: c["username"], - ) - - -@pytest.fixture -def all_tokens(github_credentials) -> List[str]: - """Return the list of tokens matching the static credential""" - - return [t.get("token", t.get("password")) for t in github_credentials] - - @pytest.fixture def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType: """Return the credentials formatted for use by the lister""" return {"github": {"github": github_credentials}} def test_authenticated_credentials( swh_scheduler, caplog, github_credentials, lister_credentials, all_tokens ): """Test credentials management when the lister is authenticated""" caplog.set_level(logging.DEBUG, "swh.lister.github.lister") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) assert lister.github_session.token_index == 0 assert sorted(lister.credentials, key=lambda t: t["username"]) == github_credentials assert lister.github_session.session.headers["Authorization"] in [ "token %s" % t for t in all_tokens ] -def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None): - """Record calls to time.sleep in the sleep_calls list""" - if duration < 0: - raise ValueError("Can't sleep for a negative amount of time!") - if sleep_calls is not None: - sleep_calls.append(duration) - - -def fake_time_time(): - """Return 0 when running time.time()""" - return 0 - - -@pytest.fixture -def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]: - """Monkeypatch `time.time` and `time.sleep`. 
Returns a list cumulating the arguments - passed to time.sleep().""" - sleeps: List[float] = [] - monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps)) - monkeypatch.setattr(time, "time", fake_time_time) - yield sleeps - - @pytest.mark.parametrize( "num_ratelimit", [1] ) # return a single rate-limit response, then continue def test_ratelimit_once_recovery( swh_scheduler, caplog, requests_ratelimited, num_ratelimit, monkeypatch_sleep_calls, lister_credentials, ): """Check that the lister recovers from hitting the rate-limit once""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) res = lister.run() # check that we used all the pages assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) token_users = [] for record in caplog.records: if "Using authentication token" in record.message: token_users.append(record.args[0]) # check that we used one more token than we saw rate limited requests assert len(token_users) == 1 + num_ratelimit # check that we slept for one second between our token uses assert monkeypatch_sleep_calls == [1] @pytest.mark.parametrize( # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a # set value for X-Ratelimit-Reset, then resume listing successfully. "num_before_ratelimit, num_ratelimit, ratelimit_reset", [(5, 6, 123456)], ) def test_ratelimit_reset_sleep( swh_scheduler, caplog, requests_ratelimited, monkeypatch_sleep_calls, num_before_ratelimit, ratelimit_reset, github_credentials, lister_credentials, ): """Check that the lister properly handles rate-limiting when providing it with authentication tokens""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) res = lister.run() assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) # We sleep 1 second every time we change credentials, then we sleep until # ratelimit_reset + 1 expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1] assert monkeypatch_sleep_calls == expected_sleep_calls found_exhaustion_message = False for record in caplog.records: if record.levelname == "INFO": if "Rate limits exhausted for all tokens" in record.message: found_exhaustion_message = True break assert found_exhaustion_message diff --git a/swh/lister/github/utils.py b/swh/lister/github/utils.py index 269d432..df6088e 100644 --- a/swh/lister/github/utils.py +++ b/swh/lister/github/utils.py @@ -1,170 +1,6 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging -import random -import time -from typing import Dict, List, Optional - -import requests -from tenacity import ( - retry, - retry_any, - retry_if_exception_type, - retry_if_result, - wait_exponential, -) - -logger = logging.getLogger(__name__) - - -class RateLimited(Exception): - def __init__(self, response): - self.reset_time: Optional[int] - - # Figure out how long we need to sleep because of that rate limit - ratelimit_reset = response.headers.get("X-Ratelimit-Reset") - retry_after = response.headers.get("Retry-After") - if ratelimit_reset is not None: - self.reset_time = int(ratelimit_reset) - 
elif retry_after is not None: - self.reset_time = int(time.time()) + int(retry_after) + 1 - else: - logger.warning( - "Received a rate-limit-like status code %s, but no rate-limit " - "headers set. Response content: %s", - response.status_code, - response.content, - ) - self.reset_time = None - self.response = response - - -class MissingRateLimitReset(Exception): - pass - - -class GitHubSession: - """Manages a :class:`requests.Session` with (optionally) multiple credentials, - and cycles through them when reaching rate-limits.""" - - def __init__( - self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None - ) -> None: - """Initialize a requests session with the proper headers for requests to - GitHub.""" - self.credentials = credentials - if self.credentials: - random.shuffle(self.credentials) - - self.session = requests.Session() - - self.session.headers.update( - {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent} - ) - - self.anonymous = not self.credentials - - if self.anonymous: - logger.warning("No tokens set in configuration, using anonymous mode") - - self.token_index = -1 - self.current_user: Optional[str] = None - - if not self.anonymous: - # Initialize the first token value in the session headers - self.set_next_session_token() - - def set_next_session_token(self) -> None: - """Update the current authentication token with the next one in line.""" - - assert self.credentials - - self.token_index = (self.token_index + 1) % len(self.credentials) - - auth = self.credentials[self.token_index] - - self.current_user = auth["username"] - logger.debug("Using authentication token for user %s", self.current_user) - - if "password" in auth: - token = auth["password"] - else: - token = auth["token"] - - self.session.headers.update({"Authorization": f"token {token}"}) - - @retry( - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_any( - # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g. - # when running the lister on a connection with high latency - retry_if_exception_type(requests.exceptions.ChunkedEncodingError), - # 502 status codes happen for a Server Error, sometimes - retry_if_result(lambda r: r.status_code == 502), - ), - ) - def _request(self, url: str) -> requests.Response: - response = self.session.get(url) - - if ( - # GitHub returns inconsistent status codes between unauthenticated - # rate limit and authenticated rate limits. Handle both. - response.status_code == 429 - or (self.anonymous and response.status_code == 403) - ): - raise RateLimited(response) - - return response - - def request(self, url) -> requests.Response: - """Repeatedly requests the given URL, cycling through credentials and sleeping - if necessary; until either a successful response or :exc:`MissingRateLimitReset` - """ - # The following for/else loop handles rate limiting; if successful, - # it provides the rest of the function with a `response` object. - # - # If all tokens are rate-limited, we sleep until the reset time, - # then `continue` into another iteration of the outer while loop, - # attempting to get data from the same URL again. 
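The comments above belong to the removed `RateLimited` bookkeeping, which now lives in `swh.core.github.utils`. Its wake-up time derivation reads well in isolation: prefer the absolute `X-Ratelimit-Reset` timestamp, fall back to the relative `Retry-After`. A standalone sketch of that rule (`reset_time_from_headers` is an illustrative name, not the library API):

```python
import time
from typing import Mapping, Optional


def reset_time_from_headers(headers: Mapping[str, str]) -> Optional[int]:
    """Unix timestamp at which a rate-limited token becomes usable again."""
    ratelimit_reset = headers.get("X-Ratelimit-Reset")
    retry_after = headers.get("Retry-After")
    if ratelimit_reset is not None:
        return int(ratelimit_reset)
    if retry_after is not None:
        # Retry-After is relative; add a one-second safety margin.
        return int(time.time()) + int(retry_after) + 1
    # No usable header: the caller eventually raises MissingRateLimitReset.
    return None
```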
- - while True: - max_attempts = len(self.credentials) if self.credentials else 1 - reset_times: Dict[int, int] = {} # token index -> time - for attempt in range(max_attempts): - try: - return self._request(url) - except RateLimited as e: - reset_info = "(unknown reset)" - if e.reset_time is not None: - reset_times[self.token_index] = e.reset_time - reset_info = "(resetting in %ss)" % (e.reset_time - time.time()) - - if not self.anonymous: - logger.info( - "Rate limit exhausted for current user %s %s", - self.current_user, - reset_info, - ) - # Use next token in line - self.set_next_session_token() - # Wait one second to avoid triggering GitHub's abuse rate limits - time.sleep(1) - - # All tokens have been rate-limited. What do we do? - - if not reset_times: - logger.warning( - "No X-Ratelimit-Reset value found in responses for any token; " - "Giving up." - ) - raise MissingRateLimitReset() - - sleep_time = max(reset_times.values()) - time.time() + 1 - logger.info( - "Rate limits exhausted for all tokens. Sleeping for %f seconds.", - sleep_time, - ) - time.sleep(sleep_time) +from swh.core.github.utils import * # noqa diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 4ccc532..2dc6cc5 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,390 +1,425 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log import xmltodict +from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] +SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") + @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. 
Defaults to url's network location if unset. incremental: bool, defaults to True. Defines whether incremental listing is activated. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) self.jar_origins: Dict[str, ListedOrigin] = {} + self.github_session = GitHubSession( + credentials=self.credentials, user_agent=USER_AGENT + ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) if response.status_code != 200: logger.error("Index %s not found, stopping", self.INDEX_URL) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P<doc>\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|" + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" )
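The named groups in the regexes above are dictated by how they are consumed later (`m_doc.group("doc")`, `m_val.groups()`, `m_time.group("mtime")`). A self-contained check of the two main line formats against the example lines quoted in the comments:

```python
import re

re_doc = re.compile(r"^doc (?P<doc>\d+)$")
re_val = re.compile(
    r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
    r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
)

assert re_doc.match("doc 13").group("doc") == "13"
m = re_val.match("    value al.aldi|sprova4j|0.1.0|sources|jar")
assert m is not None
assert m.groups() == ("al.aldi", "sprova4j", "0.1.0", "sources", "jar")
```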
# Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms...") for pom in out_pom: try: response = self.page_request(pom, {}) project = xmltodict.parse(response.content) project_d = project.get("project", {}) scm_d = project_d.get("scm") if scm_d is not None: connection = scm_d.get("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], "url": connection, } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom, ) except xmltodict.expat.ExpatError as error: logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: + """Retrieve scm origin out of the page information. Only called when the type + of the page is scm. + + Try to detect an scm/vcs repository. Note that the official format is in the form: + scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put + the repo url (without the "scm:type"), so we have to check against the content + to extract the type and url properly. + + Raises + AssertionError when the type of the page is not 'scm' + + Returns + ListedOrigin with proper canonical scm url (for github) if any is found, + None otherwise. + + """ + + assert page["type"] == "scm" + visit_type: Optional[str] = None + url: Optional[str] = None + m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"]) + if m_scm is None: + return None + + scm_type = m_scm.group("type") + if scm_type and scm_type in SUPPORTED_SCM_TYPES: + url = m_scm.group("url") + visit_type = scm_type + elif page["url"].endswith(".git"): + # Strip the "scm:" prefix; str.lstrip("scm:") would strip a character + # set, not the literal prefix. + url = page["url"][len("scm:") :] + visit_type = "git" + else: + return None + + if url and visit_type == "git": + # Non-github urls will be returned as is, github ones will be canonical ones + url = self.github_session.get_canonical_url(url) + + if not url: + return None + + assert visit_type is not None + assert self.lister_obj.id is not None + return ListedOrigin( + lister_id=self.lister_obj.id, + url=url, + visit_type=visit_type, + ) + def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: + """Convert a page of Maven repositories into a list of ListedOrigins.""" - assert self.lister_obj.id is not None - scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": - # If origin is a scm url: detect scm type and yield.
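The new `get_scm` above splits the `scm:<type>:<url>` connection string before canonicalizing GitHub urls through the API. A reduced, self-contained sketch of the split step (`split_scm` is an illustrative name; the canonicalization needs a live `GitHubSession` and is left out):

```python
import re
from typing import Optional, Tuple

SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")


def split_scm(connection: str) -> Optional[Tuple[str, str]]:
    """Mirror the first half of get_scm: return (visit_type, url) or None."""
    m = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", connection)
    if m is None:
        return None
    if m.group("type") in SUPPORTED_SCM_TYPES:
        return m.group("type"), m.group("url")
    if connection.endswith(".git"):
        return "git", connection[len("scm:") :]
    return None


assert split_scm("scm:git:git://github.com/openengsb/openengsb-framework.git") == (
    "git",
    "git://github.com/openengsb/openengsb-framework.git",
)
```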
- # Note that the official format is: - # scm:git:git://github.com/openengsb/openengsb-framework.git - # but many, many projects directly put the repo url, so we have to - # detect the content to match it properly. - m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) - if m_scm is not None: - scm_type = m_scm.group("type") - if scm_type in scm_types_ok: - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=scm_url, - visit_type=scm_type, - ) - yield origin - else: - if page["url"].endswith(".git"): - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=page["url"], - visit_type="git", - ) - yield origin + listed_origin = self.get_scm(page) + if listed_origin: + yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet + assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 331461e..b2a88f9 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,334 +1,353 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" -LIST_GIT = ( - "git://github.com/aldialimucaj/sprova4j.git", - "https://github.com/aldialimucaj/sprova4j.git", -) -LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) +USER_REPO0 = "aldialimucaj/sprova4j" +GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}" +GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}" +LIST_GIT = (GIT_REPO_URL0_HTTPS,) + +USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java" +GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}" +GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git" +GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" +LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,) LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": MVN_URL, }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", "base_url": MVN_URL, }, ) @pytest.fixture def maven_index_full(datadir) -> bytes: return Path(datadir, "http_indexes", "export_full.fld").read_bytes() @pytest.fixture def maven_index_incr_first(datadir) -> bytes: return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes() @pytest.fixture def maven_pom_1(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() @pytest.fixture def maven_index_null_mtime(datadir) -> bytes: return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes() @pytest.fixture def maven_pom_1_malformed(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes() @pytest.fixture def maven_pom_2(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() @pytest.fixture def maven_pom_3(datadir) -> bytes: return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() +@pytest.fixture +def requests_mock(requests_mock): + """If github api calls for the configured scm 
repository, returns its canonical url.""" + for url_api, url_html in [ + (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), + (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), + ]: + requests_mock.get( + url_api, + json={"html_url": url_html}, + ) + yield requests_mock + + @pytest.fixture(autouse=True) def network_requests_mock( requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): requests_mock.get(INDEX_URL, content=maven_index_full) requests_mock.get(URL_POM_1, content=maven_pom_1) requests_mock.get(URL_POM_2, content=maven_pom_2) requests_mock.get(URL_POM_3, content=maven_pom_3) def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 3 git origins + 1 maven origin with 2 releases (one per jar) - assert len(origin_urls) == 4 + assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_full_listing_malformed( swh_scheduler, requests_mock, maven_pom_1_malformed, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. requests_mock.get(URL_POM_1, content=maven_pom_1_malformed) # Then run the lister. stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 - assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC) + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. 
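The overridden `requests_mock` fixture above is what lets `GitHubSession.get_canonical_url` resolve each listed git url to the repository's `html_url`. A self-contained sketch of the request/response shape being stubbed, reusing the test's repository constants:

```python
import requests
import requests_mock

with requests_mock.Mocker() as m:
    m.get(
        "https://api.github.com/repos/aldialimucaj/sprova4j",
        json={"html_url": "https://github.com/aldialimucaj/sprova4j"},
    )
    resp = requests.get("https://api.github.com/repos/aldialimucaj/sprova4j")
    # The canonical origin url the lister ends up recording:
    assert resp.json()["html_url"] == "https://github.com/aldialimucaj/sprova4j"
```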
assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert len(origin_urls) == 2 - assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC) + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( swh_scheduler, requests_mock, http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. requests_mock.get(URL_POM_1, status_code=http_code) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() # If the maven_index_full step succeeded but not the get_pom step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 3 + assert len(scheduler_origins) == 2 def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime): requests_mock.get(INDEX_URL, content=maven_index_null_mtime) # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. 
assert stats.pages == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].last_update is None def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1): """should continue listing when failing to decode pom file.""" # Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32")) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() # If the maven_index_full step succeeded but not the pom parsing step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 3 + assert len(scheduler_origins) == 2
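The origin counts asserted throughout these tests hinge on `get_origins_from_page` grouping every version of a `groupId`/`artifactId` under a single origin URL and accumulating its source artifacts. A reduced sketch of that grouping rule; `record` is an illustrative stand-in for the bookkeeping done inside the lister:

```python
from typing import Any, Dict, List

jar_origins: Dict[str, List[Dict[str, Any]]] = {}


def record(origin_url: str, artifact: Dict[str, Any]) -> None:
    """One origin per groupId/artifactId; artifacts accumulate per version."""
    artifacts = jar_origins.setdefault(origin_url, [])
    if artifact not in artifacts:
        artifacts.append(artifact)


record("https://repo1.maven.org/maven2/al/aldi/sprova4j", {"version": "0.1.0"})
record("https://repo1.maven.org/maven2/al/aldi/sprova4j", {"version": "0.1.1"})
assert len(jar_origins) == 1  # a single maven origin...
assert len(jar_origins["https://repo1.maven.org/maven2/al/aldi/sprova4j"]) == 2  # ...with two artifacts
```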