Changeset View
Standalone View
swh/lister/golang/lister.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers | ||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | ||||||||||||
from datetime import datetime | ||||||||||||
import json | ||||||||||||
import logging | ||||||||||||
from typing import Any, Iterator, List, Optional, Tuple | ||||||||||||
from dateutil import parser | ||||||||||||
vlorentz: Use `iso8601` instead, it's smaller and we already depend on it. RFC 3339 is a subset of ISO… | ||||||||||||
Done Inline Actions — ardumont: we also moved away from the dateutil parser, which is way too lenient (and gives surprising results)
import requests | ||||||||||||
from tenacity import before_sleep_log | ||||||||||||
from swh.lister.utils import retry_policy_generic, throttling_retry | ||||||||||||
from swh.scheduler.interface import SchedulerInterface | ||||||||||||
from swh.scheduler.model import ListedOrigin | ||||||||||||
from .. import USER_AGENT | ||||||||||||
from ..pattern import CredentialsType, StatelessLister | ||||||||||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# One page of results: a list of entries decoded from the index's JSON lines.
GolangPageType = List[Any]
Done Inline Actions
(we could use a TypedDict to be even more specific, but I don't think it's worth it) vlorentz: (we could use a TypedDict to be even more specific, but I don't think it's worth it) | ||||||||||||
class GolangLister(StatelessLister[GolangPageType]):
    """
    List all Golang modules and send associated origins to scheduler.

    The lister queries the Golang module index, whose documentation can be found
    at https://index.golang.org
    """

    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
    # `limit` seems to be... limited to 2000.
    GOLANG_MODULES_INDEX_LIMIT = 2000
    LISTER_NAME = "Golang"

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        """Set up the lister and the HTTP session used to query the index.

        Args:
            scheduler: scheduler interface forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            url=self.GOLANG_MODULES_INDEX_URL,
            instance="Golang",
            credentials=credentials,
        )

        # A single session is reused for every request made by this lister.
        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def api_request(self, url: str) -> List[str]:
        """Fetch ``url`` and return the non-blank lines of the response body.

        Each returned line is expected to hold one JSON document; decoding is
        left to the caller.

        Raises:
            requests.HTTPError: if the response carries an HTTP error status.
        """
        logger.debug("Fetching URL %s", url)

        response = self.session.get(url)

        if response.status_code not in (200, 304):
            # Log the unexpected status to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )

        response.raise_for_status()

        # Split on line boundaries only: splitting on arbitrary whitespace
        # would cut a JSON document in pieces if it ever contained a space.
        return [line for line in response.text.splitlines() if line.strip()]

    def get_single_page(
        self, since: Optional[datetime] = None
    ) -> Tuple[GolangPageType, Optional[datetime]]:
        """Fetch one page of the index and the timestamp of its last entry.

        Args:
            since: when given, only entries from that point in time onwards are
                requested. It must be a timezone-aware datetime in UTC.

        Returns:
            A ``(page, last_timestamp)`` pair. ``page`` is the list of decoded
            entries, whose ``"Timestamp"`` value has been replaced by the parsed
            datetime. ``last_timestamp`` is the timestamp of the last entry of
            the page, meant to be passed back as ``since`` to fetch the next
            page. When the page is empty, ``since`` is returned unchanged.

        Raises:
            ValueError: if ``since`` is naive or not expressed in UTC.
        """
        url = f"{self.GOLANG_MODULES_INDEX_URL}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"

        if since is not None:
            # The Golang index does not understand `+00:00` for some reason
            # and expects the "timezone zero" notation instead. The textual
            # replacement below is only correct for UTC datetimes, so refuse
            # anything else rather than send a malformed query.
            utc_offset = since.utcoffset()
            if utc_offset is None or utc_offset.total_seconds() != 0:
                raise ValueError(
                    f"`since` must be a timezone-aware UTC datetime, got {since!r}"
                )
            as_date = since.isoformat().replace("+00:00", "Z")
            url = f"{url}&since={as_date}"

        page: GolangPageType = []
        for as_json in self.api_request(url):
            entry = json.loads(as_json)
            timestamp = parser.isoparse(entry["Timestamp"])
            # We've already parsed it and we'll need the datetime later, save it
            entry["Timestamp"] = timestamp
            page.append(entry)
            # The index is guaranteed to be sorted in chronological order
            since = timestamp

        return page, since

    def get_pages(self) -> Iterator[GolangPageType]:
        """Yield every page of the index, oldest entries first.

        The timestamp of the last entry of a page is used as ``since`` for the
        next request. The index appears to treat ``since`` as inclusive: the
        boundary entry is returned again by the next request, so the raw page
        would never come back empty. Entries already yielded for the boundary
        timestamp are therefore dropped, and iteration stops as soon as a
        request brings nothing new.
        """
        boundary: Optional[datetime] = None
        # (path, version) pairs already yielded whose timestamp is `boundary`
        seen: set = set()

        page, since = self.get_single_page()
        while page:
            yield page

            if since != boundary:
                # The boundary moved forward: entries remembered for the
                # previous boundary cannot be returned again.
                boundary = since
                seen = set()
            seen.update(
                (entry["Path"], entry["Version"])
                for entry in page
                if entry["Timestamp"] == boundary
            )

            candidates, since = self.get_single_page(since=boundary)
            page = [
                entry
                for entry in candidates
                if entry["Timestamp"] != boundary
                or (entry["Path"], entry["Version"]) not in seen
            ]

    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all Golang projects and yield ListedOrigin instances.
        """
        assert self.lister_obj.id is not None

        for module in page:
            path = module["Path"]
            version = module["Version"]
            # See https://proxy.golang.org for documentation on the proxy protocol
            origin_url = f"{path}/@v/{version}.zip"

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="golang",
                last_update=module["Timestamp"],
            )
Use iso8601 instead, it's smaller and we already depend on it. RFC 3339 is a subset of ISO 8601, so it should be fine.