Page MenuHomeSoftware Heritage

D8218.id29705.diff
No OneTemporary

D8218.id29705.diff

diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py
--- a/swh/lister/gogs/lister.py
+++ b/swh/lister/gogs/lister.py
@@ -3,10 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from dataclasses import asdict, dataclass
import logging
import random
from typing import Any, Dict, Iterator, List, Optional
-from urllib.parse import urljoin
+from urllib.parse import parse_qs, urljoin, urlparse
import iso8601
import requests
@@ -17,15 +18,34 @@
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
-# Aliasing page results returned by `GogsLister.get_pages` method
-GogsListerPage = List[Dict[str, Any]]
+Repo = Dict[str, Any]
-class GogsLister(StatelessLister[GogsListerPage]):
+@dataclass
+class GogsListerPage:
+ repos: Optional[List[Repo]] = None
+ next_link: Optional[str] = None
+
+
+@dataclass
+class GogsListerState:
+ last_seen_next_link: Optional[str] = None
+ """Last link header (not visited yet) during an incremental pass."""
+
+
+def _parse_page_id(url: Optional[str]) -> int:
+ """Parse the page id from a Gogs page url."""
+ if url is None:
+ return 0
+
+ return int(parse_qs(urlparse(url).query)["page"][0])
+
+
+class GogsLister(Lister[GogsListerState, GogsListerPage]):
"""List origins from the Gogs
@@ -61,7 +81,6 @@
self.query_params = {
"limit": page_size,
- "page": 1,
}
self.api_token = api_token
@@ -88,6 +107,12 @@
}
)
+ def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState:
+ return GogsListerState(**d)
+
+ def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
+ return asdict(state)
+
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
@@ -107,38 +132,64 @@
return response
@classmethod
- def results_simplified(cls, body: Dict[str, GogsListerPage]) -> GogsListerPage:
+ def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]:
fields_filter = ["id", "clone_url", "updated_at"]
return [{k: r[k] for k in fields_filter} for r in body["data"]]
def get_pages(self) -> Iterator[GogsListerPage]:
- # base with trailing slash, path without leading slash for urljoin
- url = urljoin(self.url, self.REPO_LIST_PATH)
- response = self.page_request(url, self.query_params)
+ page_id = 1
+ if self.state.last_seen_next_link is not None:
+ page_id = _parse_page_id(self.state.last_seen_next_link)
- while True:
- page_results = self.results_simplified(response.json())
+ # base with trailing slash, path without leading slash for urljoin
+ next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH)
+ response = self.page_request(next_link, {**self.query_params, "page": page_id})
- yield page_results
+ while next_link is not None:
+ repos = self.extract_repos(response.json())
assert len(response.links) > 0, "API changed: no Link header found"
if "next" in response.links:
- url = response.links["next"]["url"]
+ next_link = response.links["next"]["url"]
else:
- break
+ next_link = None # Happens for the last page
+
+ yield GogsListerPage(repos=repos, next_link=next_link)
- response = self.page_request(url, {})
+ if next_link is not None:
+ response = self.page_request(next_link, {})
def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]:
"""Convert a page of Gogs repositories into a list of ListedOrigins"""
assert self.lister_obj.id is not None
+ assert page.repos is not None
- for repo in page:
- last_update = iso8601.parse_date(repo["updated_at"])
+ for r in page.repos:
+ last_update = iso8601.parse_date(r["updated_at"])
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
- url=repo["clone_url"],
+ url=r["clone_url"],
last_update=last_update,
)
+
+ def commit_page(self, page: GogsListerPage) -> None:
+ last_seen_next_link = page.next_link # None for last page
+
+ page_id = _parse_page_id(last_seen_next_link)
+ state_page_id = _parse_page_id(self.state.last_seen_next_link)
+
+ if (last_seen_next_link is None) or (page_id > state_page_id):
+ self.state.last_seen_next_link = last_seen_next_link
+
+ def finalize(self) -> None:
+ scheduler_state = self.get_state_from_scheduler() # None for last page
+
+ state_page_id = _parse_page_id(self.state.last_seen_next_link)
+ scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link)
+
+ if (self.state.last_seen_next_link is None) or (
+ state_page_id > scheduler_page_id
+ ):
+ self.updated = True
diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3
new file mode 100644
--- /dev/null
+++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3
@@ -0,0 +1,168 @@
+{
+ "data": [
+ {
+ "id": 340,
+ "owner": {
+ "id": 585,
+ "username": "zork",
+ "login": "zork",
+ "full_name": "",
+ "email": "f905334@trbvm.com",
+ "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon"
+ },
+ "name": "beyond-the-titanic",
+ "full_name": "zork/beyond-the-titanic",
+ "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?",
+ "private": false,
+ "fork": false,
+ "parent": null,
+ "empty": false,
+ "mirror": false,
+ "size": 1436672,
+ "html_url": "https://try.gogs.io/zork/beyond-the-titanic",
+ "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git",
+ "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git",
+ "website": "",
+ "stars_count": 0,
+ "forks_count": 1,
+ "watchers_count": 1,
+ "open_issues_count": 0,
+ "default_branch": "master",
+ "created_at": "2015-03-03T22:51:12Z",
+ "updated_at": "2022-03-26T07:28:38Z"
+ },
+ {
+ "id": 350,
+ "owner": {
+ "id": 599,
+ "username": "perekre",
+ "login": "perekre",
+ "full_name": "",
+ "email": "perekre@nincsmail.com",
+ "avatar_url": "https://secure.gravatar.com/avatar/0e2666adf16f8a958a56141a2d94565c?d=identicon"
+ },
+ "name": "beyond-the-titanic",
+ "full_name": "perekre/beyond-the-titanic",
+ "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?",
+ "private": false,
+ "fork": true,
+ "parent": {
+ "id": 340,
+ "owner": {
+ "id": 585,
+ "username": "zork",
+ "login": "zork",
+ "full_name": "",
+ "email": "f905334@trbvm.com",
+ "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon"
+ },
+ "name": "beyond-the-titanic",
+ "full_name": "zork/beyond-the-titanic",
+ "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?",
+ "private": false,
+ "fork": false,
+ "parent": null,
+ "empty": false,
+ "mirror": false,
+ "size": 1436672,
+ "html_url": "https://try.gogs.io/zork/beyond-the-titanic",
+ "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git",
+ "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git",
+ "website": "",
+ "stars_count": 0,
+ "forks_count": 1,
+ "watchers_count": 1,
+ "open_issues_count": 0,
+ "default_branch": "master",
+ "created_at": "2015-03-03T22:51:12Z",
+ "updated_at": "2022-03-26T07:28:38Z",
+ "permissions": {
+ "admin": false,
+ "push": false,
+ "pull": true
+ }
+ },
+ "empty": false,
+ "mirror": false,
+ "size": 1437696,
+ "html_url": "https://try.gogs.io/perekre/beyond-the-titanic",
+ "ssh_url": "git@try.gogs.io:perekre/beyond-the-titanic.git",
+ "clone_url": "https://try.gogs.io/perekre/beyond-the-titanic.git",
+ "website": "",
+ "stars_count": 0,
+ "forks_count": 0,
+ "watchers_count": 1,
+ "open_issues_count": 0,
+ "default_branch": "master",
+ "created_at": "2015-03-04T10:40:46Z",
+ "updated_at": "2022-03-26T07:28:38Z"
+ },
+ {
+ "id": 369,
+ "owner": {
+ "id": 108,
+ "username": "yinheli",
+ "login": "yinheli",
+ "full_name": "",
+ "email": "me@yinheli.com",
+ "avatar_url": "https://secure.gravatar.com/avatar/dedb067ecae8155b87428ac7920dd0ae?d=identicon"
+ },
+ "name": "digits",
+ "full_name": "yinheli/digits",
+ "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.",
+ "private": false,
+ "fork": true,
+ "parent": {
+ "id": 339,
+ "owner": {
+ "id": 585,
+ "username": "zork",
+ "login": "zork",
+ "full_name": "",
+ "email": "f905334@trbvm.com",
+ "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon"
+ },
+ "name": "digits",
+ "full_name": "zork/digits",
+ "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.",
+ "private": false,
+ "fork": false,
+ "parent": null,
+ "empty": false,
+ "mirror": false,
+ "size": 18432,
+ "html_url": "https://try.gogs.io/zork/digits",
+ "ssh_url": "git@try.gogs.io:zork/digits.git",
+ "clone_url": "https://try.gogs.io/zork/digits.git",
+ "website": "",
+ "stars_count": 0,
+ "forks_count": 1,
+ "watchers_count": 1,
+ "open_issues_count": 0,
+ "default_branch": "master",
+ "created_at": "2015-03-03T22:47:56Z",
+ "updated_at": "2022-03-26T07:28:38Z",
+ "permissions": {
+ "admin": false,
+ "push": false,
+ "pull": true
+ }
+ },
+ "empty": false,
+ "mirror": false,
+ "size": 18432,
+ "html_url": "https://try.gogs.io/yinheli/digits",
+ "ssh_url": "git@try.gogs.io:yinheli/digits.git",
+ "clone_url": "https://try.gogs.io/yinheli/digits.git",
+ "website": "",
+ "stars_count": 0,
+ "forks_count": 0,
+ "watchers_count": 1,
+ "open_issues_count": 0,
+ "default_branch": "master",
+ "created_at": "2015-03-06T01:31:17Z",
+ "updated_at": "2022-03-26T07:28:38Z"
+ }
+ ],
+ "ok": true
+}
diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py
--- a/swh/lister/gogs/tests/test_lister.py
+++ b/swh/lister/gogs/tests/test_lister.py
@@ -11,46 +11,61 @@
import pytest
from requests import HTTPError
-from swh.lister.gogs.lister import GogsLister
+from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id
from swh.scheduler.model import ListedOrigin
TRY_GOGS_URL = "https://try.gogs.io/api/v1/"
def try_gogs_page(n: int):
- return TRY_GOGS_URL + f"repos/search?page={n}&limit=3"
+ return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3"
+
+
+P1 = try_gogs_page(1)
+P2 = try_gogs_page(2)
+P3 = try_gogs_page(3)
@pytest.fixture
def trygogs_p1(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text()
- headers = {
- "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=try_gogs_page(2))
- }
- page_result = GogsLister.results_simplified(json.loads(text))
- origin_urls = [r["clone_url"] for r in page_result]
+ headers = {"Link": '<{p2}>; rel="next",<{p3}>; rel="last"'.format(p2=P2, p3=P3)}
+ page_result = GogsListerPage(
+ repos=GogsLister.extract_repos(json.loads(text)), next_link=P2
+ )
+ origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygogs_p2(datadir):
text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text()
- headers = {
- "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1))
- }
- page_result = GogsLister.results_simplified(json.loads(text))
- origin_urls = [r["clone_url"] for r in page_result]
+ headers = {"Link": '<{p3}>; rel="next",<{p1}>; rel="prev"'.format(p1=P1, p3=P3)}
+ page_result = GogsListerPage(
+ repos=GogsLister.extract_repos(json.loads(text)), next_link=P3
+ )
+ origin_urls = [r["clone_url"] for r in page_result.repos]
+ return text, headers, page_result, origin_urls
+
+
+@pytest.fixture
+def trygogs_p3(datadir):
+ text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text()
+ headers = {"Link": '<{p2}>; rel="prev",<{p1}>; rel="first"'.format(p1=P1, p2=P2)}
+ page_result = GogsListerPage(
+ repos=GogsLister.extract_repos(json.loads(text)), next_link=None
+ )
+ origin_urls = [r["clone_url"] for r in page_result.repos]
return text, headers, page_result, origin_urls
@pytest.fixture
-def trygogs_empty_page():
+def trygogs_empty_p3():
origins_urls = []
- page_result = {"data": [], "ok": True}
- headers = {
- "Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=try_gogs_page(1))
- }
- text = json.dumps(page_result)
+ body = {"data": [], "ok": True}
+ headers = {"Link": '<{p2}>; rel="prev",<{p1}>; rel="first"'.format(p1=P1, p2=P2)}
+ page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None)
+ text = json.dumps(body)
return text, headers, page_result, origins_urls
@@ -69,7 +84,7 @@
def test_gogs_full_listing(
- swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_empty_page
+ swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3
):
kwargs = dict(
url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret"
@@ -80,84 +95,161 @@
p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1
p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2
- p3_text, p3_headers, _, _ = trygogs_empty_page
+ p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3
- requests_mock.get(try_gogs_page(1), text=p1_text, headers=p1_headers)
- requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers)
- requests_mock.get(try_gogs_page(3), text=p3_text, headers=p3_headers)
+ requests_mock.get(P1, text=p1_text, headers=p1_headers)
+ requests_mock.get(P2, text=p2_text, headers=p2_headers)
+ requests_mock.get(P3, text=p3_text, headers=p3_headers)
stats = lister.run()
- assert stats.pages == 2
- assert stats.origins == 6
+ assert stats.pages == 3
+ assert stats.origins == 9
- calls = [mocker.call(p1_result), mocker.call(p2_result)]
- lister.get_origins_from_page.assert_has_calls(calls)
+ calls = map(mocker.call, [p1_result, p2_result, p3_result])
+ lister.get_origins_from_page.assert_has_calls(list(calls))
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
+ check_listed_origins(
+ p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
+ )
- assert lister.get_state_from_scheduler() is None
+ assert (
+ lister.get_state_from_scheduler().last_seen_next_link is None
+ ) # None because P3 was the last page (no next link)
def test_gogs_auth_instance(
- swh_scheduler, requests_mock, trygogs_p1, trygogs_empty_page
+ swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_empty_p3
):
"""Covers token authentication, token from credentials,
instance inference from URL."""
api_token = "secret"
- instance = "try.gogs.io"
- creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
+ instance = "try_gogs"
- kwargs1 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance)
- lister = GogsLister(scheduler=swh_scheduler, **kwargs1)
+ # Test lister initialization without api_token or credentials:
+ with pytest.raises(ValueError, match="No credentials or API token provided"):
+ kwargs1 = dict(url=TRY_GOGS_URL, instance=instance)
+ GogsLister(scheduler=swh_scheduler, **kwargs1)
- # test API token
- assert "Authorization" in lister.session.headers
+ # Test lister initialization using api_token:
+ kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance)
+ lister = GogsLister(scheduler=swh_scheduler, **kwargs2)
assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
- with pytest.raises(ValueError, match="No credentials or API token provided"):
- kwargs2 = dict(url=TRY_GOGS_URL, instance=instance)
- GogsLister(scheduler=swh_scheduler, **kwargs2)
-
+ # Test lister initialization with credentials and run it:
+ creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}}
kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3)
lister = GogsLister(scheduler=swh_scheduler, **kwargs3)
-
- # test API token from credentials
- assert "Authorization" in lister.session.headers
assert lister.session.headers["Authorization"].lower() == "token %s" % api_token
-
- # test instance inference from URL
- assert lister.instance
- assert "gogs" in lister.instance
+ assert lister.instance == "try_gogs"
# setup requests mocking
p1_text, p1_headers, _, _ = trygogs_p1
- p2_text, p2_headers, _, _ = trygogs_empty_page
+ p2_text, p2_headers, _, _ = trygogs_p2
+ p3_text, p3_headers, _, _ = trygogs_empty_p3
- base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH
- requests_mock.get(base_url, text=p1_text, headers=p1_headers)
- requests_mock.get(try_gogs_page(2), text=p2_text, headers=p2_headers)
- # now check the lister runs without error
- stats = lister.run()
+ requests_mock.get(P1, text=p1_text, headers=p1_headers)
+ requests_mock.get(P2, text=p2_text, headers=p2_headers)
+ requests_mock.get(P3, text=p3_text, headers=p3_headers)
- assert stats.pages == 2
- assert stats.origins == 3
+ # lister should run without any error and extract the origins
+ stats = lister.run()
+ assert stats.pages == 3
+ assert stats.origins == 6
@pytest.mark.parametrize("http_code", [400, 500, 502])
-def test_gogs_list_http_error(swh_scheduler, requests_mock, http_code):
+def test_gogs_list_http_error(
+ swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3
+):
"""Test handling of some HTTP errors commonly encountered"""
lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, api_token="secret")
+ p1_text, p1_headers, _, p1_origin_urls = trygogs_p1
+ p3_text, p3_headers, _, _ = trygogs_p3
+
base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH
- requests_mock.get(base_url, status_code=http_code)
+ requests_mock.get(
+ base_url,
+ [
+ {"text": p1_text, "headers": p1_headers, "status_code": 200},
+ {"status_code": http_code},
+ {"text": p3_text, "headers": p3_headers, "status_code": 200},
+ ],
+ )
+
+ with pytest.raises(HTTPError):
+ lister.run()
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+ check_listed_origins(
+ p1_origin_urls, scheduler_origins
+ ) # Only the first page is listed
+
+
+def test_gogs_incremental_lister(
+ swh_scheduler,
+ requests_mock,
+ mocker,
+ trygogs_p1,
+ trygogs_p2,
+ trygogs_p3,
+):
+ kwargs = dict(
+ url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret"
+ )
+ lister = GogsLister(scheduler=swh_scheduler, **kwargs)
+
+ lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page")
+
+ p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1
+ p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2
+
+ requests_mock.get(P1, text=p1_text, headers=p1_headers)
+ requests_mock.get(P2, text=p2_text, headers=p2_headers)
+ requests_mock.get(
+ P3, status_code=400
+ ) # Note: 3rd page throws 4xx in the first listing attempt
+ # First listing:
with pytest.raises(HTTPError):
lister.run()
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- assert len(scheduler_origins) == 0
+
+ assert lister.updated
+ lister_state = lister.get_state_from_scheduler()
+ last_seen_next_link = lister_state.last_seen_next_link
+ assert last_seen_next_link == P3
+
+ check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
+
+ # Second listing attempt should restart from last state:
+ p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3
+ requests_mock.get(P3, text=p3_text, headers=p3_headers)
+
+ lister.session.get = mocker.spy(lister.session, "get")
+
+ stats_restart = lister.run()
+
+ assert stats_restart.pages == 1
+ assert stats_restart.origins == 3
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ page_id = _parse_page_id(last_seen_next_link)
+ query_params = lister.query_params
+ query_params["page"] = page_id
+
+ lister.session.get.assert_called_once_with(
+ TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
+ )
+
+ # All the 9 origins (3 pages) should be passed on to the scheduler:
+ check_listed_origins(
+ p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
+ )

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 3:12 PM (11 w, 15 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225304

Event Timeline