diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -11,7 +11,7 @@ import iso8601 -from swh.core.github.utils import GitHubSession, MissingRateLimitReset +from swh.core.github.utils import MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -78,6 +78,7 @@ credentials=credentials, url=self.API_URL, instance="github", + with_github_session=True, ) self.first_id = first_id @@ -85,11 +86,6 @@ self.relisting = self.first_id is not None or self.last_id is not None - self.github_session = GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) - def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) @@ -109,6 +105,7 @@ logger.debug("Getting page %s", current_url) try: + assert self.github_session is not None response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -141,7 +141,7 @@ caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler) - assert lister.github_session.anonymous + assert lister.github_session is not None and lister.github_session.anonymous assert "using anonymous mode" in caplog.records[-1].message caplog.clear() diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -14,7 +14,6 @@ import lxml import requests -from swh.core.github.utils import GitHubSession from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -88,15 +87,12 @@ credentials=credentials, url=url, instance=instance, + with_github_session=True, ) self.session.headers.update({"Accept": "application/json"}) self.jar_origins: Dict[str, ListedOrigin] = {} - self.github_session = GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) @@ -294,6 +290,7 @@ return None if url and visit_type == "git": + assert self.github_session is not None # Non-github urls will be returned as is, github ones will be canonical ones url = self.github_session.get_canonical_url(url) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -29,7 +29,6 @@ import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError -from swh.core.github.utils import GitHubSession from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT from swh.lister import TARBALL_EXTENSIONS from swh.lister.pattern import CredentialsType, StatelessLister @@ -331,6 +330,7 @@ url=url.rstrip("/"), instance=instance, credentials=credentials, + with_github_session=canonicalize, ) # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? @@ -338,16 +338,6 @@ self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore self.session = requests.Session() - # for testing purposes, we may want to skip this step (e.g. docker run and rate - # limit) - self.github_session = ( - GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) - if canonicalize - else None - ) def build_artifact( self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -11,7 +11,6 @@ import iso8601 import requests -from swh.core.github.utils import GitHubSession from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -60,14 +59,11 @@ url=self.PACKAGIST_PACKAGES_LIST_URL, instance="packagist", credentials=credentials, + with_github_session=True, ) self.session.headers.update({"Accept": "application/json"}) self.listing_date = datetime.now().astimezone(tz=timezone.utc) - self.github_session = GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: last_listing_date = d.get("last_listing_date") @@ -152,6 +148,7 @@ if visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical # ones + assert self.github_session is not None origin_url = ( self.github_session.get_canonical_url(origin_url) or origin_url ) diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -14,6 +14,7 @@ from tenacity.before_sleep import before_sleep_log from swh.core.config import load_from_envvar +from swh.core.github.utils import GitHubSession from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface @@ -93,6 +94,7 @@ """ LISTER_NAME: str = "" + github_session: Optional[GitHubSession] = None def __init__( self, @@ -100,6 +102,7 @@ url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + with_github_session: bool = False, ): if not self.LISTER_NAME: raise ValueError("Must set the LISTER_NAME attribute on Lister classes") @@ -127,6 +130,14 @@ self.session.headers.update( {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME} ) + self.github_session: Optional[GitHubSession] = ( + GitHubSession( + credentials=credentials.get("github", {}).get("github", []), + user_agent=str(self.session.headers["User-Agent"]), + ) + if with_github_session + else None + ) self.recorded_origins: Set[str] = set()