Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/golang/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
from datetime import datetime | from datetime import datetime | ||||
import json | import json | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional, Tuple | from typing import Any, Dict, Iterator, List, Optional, Tuple | ||||
import iso8601 | import iso8601 | ||||
import requests | |||||
from tenacity import before_sleep_log | |||||
from swh.lister.utils import http_retry | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | |||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
@dataclass
class GolangStateType:
    """State carried by the Golang lister between runs."""

    # Timestamp of the latest entry recorded so far; None when nothing has
    # been recorded yet.
    last_seen: Optional[datetime] = None
Show All 25 Lines | class GolangLister(Lister[GolangStateType, GolangPageType]): | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url=self.GOLANG_MODULES_INDEX_URL, | url=self.GOLANG_MODULES_INDEX_URL, | ||||
instance=self.LISTER_NAME, | instance=self.LISTER_NAME, | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.session = requests.Session() | self.session.headers.update({"Accept": "application/json"}) | ||||
self.session.headers.update( | |||||
{"Accept": "application/json", "User-Agent": USER_AGENT} | |||||
) | |||||
self.incremental = incremental | self.incremental = incremental | ||||
def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType:
    """Build a ``GolangStateType`` from its dictionary representation.

    Args:
        d: mapping that may contain a ``"last_seen"`` entry.

    Returns:
        A state whose ``last_seen`` is the value parsed by
        ``iso8601.parse_date``, or ``None`` when the entry is missing or
        ``None``.
    """
    as_string = d.get("last_seen")
    last_seen = iso8601.parse_date(as_string) if as_string is not None else None
    return GolangStateType(last_seen=last_seen)
def state_to_dict(self, state: GolangStateType) -> Dict[str, Any]:
    """Convert a lister state into a plain dictionary.

    Args:
        state: the state to convert.

    Returns:
        A dict with a single ``"last_seen"`` key holding the result of
        ``state.last_seen.isoformat()``, or ``None`` when ``last_seen`` is
        ``None``.
    """
    last_seen = state.last_seen
    return {"last_seen": last_seen.isoformat() if last_seen is not None else None}
def finalize(self):
    """Set ``self.updated`` when an incremental run has moved ``last_seen`` forward.

    Nothing happens unless ``self.incremental`` is true and the current
    state has a ``last_seen`` value. In that case the state returned by
    ``self.get_state_from_scheduler()`` is compared with the current one,
    and ``self.updated`` is set to ``True`` when the scheduler has no
    ``last_seen`` or the current value is strictly later.
    """
    if self.incremental and self.state.last_seen is not None:
        scheduler_state = self.get_state_from_scheduler()
        if (
            scheduler_state.last_seen is None
            or self.state.last_seen > scheduler_state.last_seen
        ):
            self.updated = True
def api_request(self, url: str) -> List[str]:
    """Fetch ``url`` and return the response body split on whitespace.

    The request itself is delegated to ``self.http_request``.
    NOTE(review): ``http_request`` is not defined in the visible part of
    this file; presumably the parent ``Lister`` class provides it together
    with retry and status handling. Confirm before relying on that.

    Args:
        url: the URL to fetch.

    Returns:
        The whitespace-separated chunks of ``response.text``.
    """
    response = self.http_request(url)
    return response.text.split()
def get_single_page( | def get_single_page( | ||||
self, since: Optional[datetime] = None | self, since: Optional[datetime] = None | ||||
) -> Tuple[GolangPageType, Optional[datetime]]: | ) -> Tuple[GolangPageType, Optional[datetime]]: | ||||
"""Return a page from the API and the timestamp of its last entry. | """Return a page from the API and the timestamp of its last entry. | ||||
Since all entries are sorted by chronological order, the timestamp is useful | Since all entries are sorted by chronological order, the timestamp is useful | ||||
both for pagination and later for incremental runs.""" | both for pagination and later for incremental runs.""" | ||||
▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines |