Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/golang/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | |||||
from datetime import datetime | from datetime import datetime | ||||
import json | import json | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional, Tuple | from typing import Any, Dict, Iterator, List, Optional, Tuple | ||||
import iso8601 | import iso8601 | ||||
import requests | import requests | ||||
from tenacity import before_sleep_log | from tenacity import before_sleep_log | ||||
from swh.lister.utils import retry_policy_generic, throttling_retry | from swh.lister.utils import retry_policy_generic, throttling_retry | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
@dataclass | |||||
class GolangStateType: | |||||
last_seen: Optional[datetime] = None | |||||
"""Last timestamp of a package version we have saved. | |||||
Used as a starting point for an incremental listing.""" | |||||
GolangPageType = List[Dict[str, Any]] | GolangPageType = List[Dict[str, Any]] | ||||
class GolangLister(StatelessLister[GolangPageType]): | class GolangLister(Lister[GolangStateType, GolangPageType]): | ||||
""" | """ | ||||
List all Golang modules and send associated origins to scheduler. | List all Golang modules and send associated origins to scheduler. | ||||
The lister queries the Golang module index, whose documentation can be found | The lister queries the Golang module index, whose documentation can be found | ||||
at https://index.golang.org | at https://index.golang.org | ||||
""" | """ | ||||
GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index" | GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index" | ||||
# `limit` seems to be... limited to 2000. | # `limit` seems to be... limited to 2000. | ||||
GOLANG_MODULES_INDEX_LIMIT = 2000 | GOLANG_MODULES_INDEX_LIMIT = 2000 | ||||
LISTER_NAME = "Golang" | LISTER_NAME = "Golang" | ||||
def __init__( | def __init__( | ||||
self, scheduler: SchedulerInterface, credentials: CredentialsType = None, | self, | ||||
scheduler: SchedulerInterface, | |||||
incremental: bool = False, | |||||
credentials: CredentialsType = None, | |||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url=self.GOLANG_MODULES_INDEX_URL, | url=self.GOLANG_MODULES_INDEX_URL, | ||||
instance="Golang", | instance="Golang", | ||||
anlambert: You should add an `incremental` parameter to allow full relisting if set to `False`. | |||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.session = requests.Session() | self.session = requests.Session() | ||||
self.session.headers.update( | self.session.headers.update( | ||||
{"Accept": "application/json", "User-Agent": USER_AGENT} | {"Accept": "application/json", "User-Agent": USER_AGENT} | ||||
) | ) | ||||
self.incremental = incremental | |||||
def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType: | |||||
as_string = d.get("last_seen") | |||||
last_seen = iso8601.parse_date(as_string) if as_string is not None else None | |||||
return GolangStateType(last_seen=last_seen) | |||||
def state_to_dict(self, state: GolangStateType) -> Dict[str, Any]: | |||||
return { | |||||
"last_seen": state.last_seen.isoformat() | |||||
if state.last_seen is not None | |||||
else None | |||||
} | |||||
def finalize(self): | |||||
if self.incremental and self.state.last_seen is not None: | |||||
scheduler_state = self.get_state_from_scheduler() | |||||
if ( | |||||
scheduler_state.last_seen is None | |||||
or self.state.last_seen > scheduler_state.last_seen | |||||
): | |||||
self.updated = True | |||||
@throttling_retry( | @throttling_retry( | ||||
retry=retry_policy_generic, | retry=retry_policy_generic, | ||||
before_sleep=before_sleep_log(logger, logging.WARNING), | before_sleep=before_sleep_log(logger, logging.WARNING), | ||||
) | ) | ||||
def api_request(self, url: str) -> List[str]: | def api_request(self, url: str) -> List[str]: | ||||
logger.debug("Fetching URL %s", url) | logger.debug("Fetching URL %s", url) | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | ) -> Tuple[GolangPageType, Optional[datetime]]: | ||||
entry["Timestamp"] = timestamp | entry["Timestamp"] = timestamp | ||||
page.append(entry) | page.append(entry) | ||||
# The index is guaranteed to be sorted in chronological order | # The index is guaranteed to be sorted in chronological order | ||||
since = timestamp | since = timestamp | ||||
return page, since | return page, since | ||||
def get_pages(self) -> Iterator[GolangPageType]: | def get_pages(self) -> Iterator[GolangPageType]: | ||||
page, since = self.get_single_page() | since = None | ||||
last_since = since | if self.incremental: | ||||
while page: | since = self.state.last_seen | ||||
yield page | |||||
page, since = self.get_single_page(since=since) | page, since = self.get_single_page(since=since) | ||||
if last_since == since: | if since == self.state.last_seen: | ||||
# The index returns packages whose timestamp are greater or | # The index returns packages whose timestamp are greater or | ||||
# equal to the date provided as parameter, which will create | # equal to the date provided as parameter, which will create | ||||
# an infinite loop if not stopped here. | # an infinite loop if not stopped here. | ||||
return [] | return [], since | ||||
last_since = since | if since is not None: | ||||
self.state.last_seen = since | |||||
anlambert (inline comment, not done): You should ignore the state here if the lister is not in incremental mode.
Alphare (inline comment, done): I should ignore it for listing, but not for updating the state, correct? That way, a full run will still save the last timestamp for the next incremental run.
anlambert (inline comment, not done): Other listers do not save any state when they are executed in non-incremental mode, so you should do the same, in my opinion.
while page: | |||||
yield page | |||||
page, since = self.get_single_page(since=since) | |||||
if since == self.state.last_seen: | |||||
return [], since | |||||
if since is not None: | |||||
self.state.last_seen = since | |||||
def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]: | ||||
""" | """ | ||||
Iterate on all Golang projects and yield ListedOrigin instances. | Iterate on all Golang projects and yield ListedOrigin instances. | ||||
""" | """ | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
for module in page: | for module in page: | ||||
path = module["Path"] | path = module["Path"] | ||||
# The loader will be expected to use the golang proxy to do the | # The loader will be expected to use the golang proxy to do the | ||||
# actual downloading. We're using `pkg.go.dev` so that the URL points | # actual downloading. We're using `pkg.go.dev` so that the URL points | ||||
# to somewhere useful for a human instead of an (incomplete) API path. | # to somewhere useful for a human instead of an (incomplete) API path. | ||||
origin_url = f"https://pkg.go.dev/{path}" | origin_url = f"https://pkg.go.dev/{path}" | ||||
# Since the Go index lists versions and not just packages, there will | # Since the Go index lists versions and not just packages, there will | ||||
# be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, | # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, | ||||
# so only the last timestamp will be used, with no duplicates. | # so only the last timestamp will be used, with no duplicates. | ||||
# Performance should not be an issue as they are sent to the db in bulk. | # Performance should not be an issue as they are sent to the db in bulk. | ||||
anlambert (inline comment, not done): Could you use `https://pkg.go.dev/{path}` instead? The current origin URL yields a 404 when trying to browse it, which is not great (for instance, for the golang.org/x/text package).
Alphare (inline comment, done): Sure, then the loader will need to expect those URLs and substitute them with the proxy prefix. If that's fine with you, I can make that change.
anlambert (inline comment, not done): Sounds good to me, thanks!
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
url=origin_url, | url=origin_url, | ||||
visit_type="golang", | visit_type="golang", | ||||
last_update=module["Timestamp"], | last_update=module["Timestamp"], | ||||
) | ) |
anlambert: You should add an `incremental` parameter to allow full relisting if set to `False`.