Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/rubygems/lister.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional, Text

import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import __version__
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Value sent in the ``User-Agent`` header of every request made by the lister;
# it embeds the package version and a contact URL.
USER_AGENT = (
    f"Software Heritage RubyGems Lister v{__version__} "
    "(+https://www.softwareheritage.org/contact)"
)

# Aliasing the page results returned by `get_pages` method from the lister.
# A "page" is a plain string: one origin URL.
RubyGemsListerPage = Text
class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
    """Lister for RubyGems.org, the Ruby community’s gem hosting service.

    The lister downloads the plain-text index served at ``INDEX_URL``,
    extracts the package name found at the start of each line and emits one
    origin per distinct package name.
    """

    LISTER_NAME = "rubygems"
    VISIT_TYPE = "rubygems"
    INSTANCE = "rubygems"

    # Plain-text index file from which package names are read.
    INDEX_URL = "https://rubygems.org/versions"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister and the HTTP session used for all requests.

        Args:
            scheduler: scheduler instance forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.INDEX_URL,
        )
        # A single session is reused so every request carries the same headers.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "text/plain",
                "User-Agent": USER_AGENT,
            }
        )

    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Perform a GET request on ``url`` and return the response.

        Args:
            url: URL to fetch.
            params: query parameters passed to the request.

        Returns:
            The HTTP response, returned only when its status is not an error.

        Raises:
            requests.HTTPError: raised by ``raise_for_status()`` on an error
                status code.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            # Log the details before raising so the failure can be diagnosed.
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[RubyGemsListerPage]:
        """Yield one 'page' per distinct package name found in the index.

        It uses the index file located at `https://rubygems.org/versions`
        to get a list of package names. Each page is an origin url based on
        the following pattern::

            https://rubygems.org/gems/{pkgname}

        Pages are yielded in the order in which package names first appear in
        the index, so the output is the same from one run to the next.
        """
        response = self.page_request(url=self.url, params={})
        data = response.content.decode()

        # remove the first 3 lines (file headers + first package named '-');
        # the package name is the first space-separated field of each line.
        names = (line.split(" ")[0] for line in data.splitlines()[3:])

        # Remove duplicates while keeping first-seen order (dict keys are
        # insertion-ordered), and skip blank or space-prefixed lines, which
        # would otherwise produce an origin URL without a package name.
        package_names: List[str] = list(
            dict.fromkeys(name for name in names if name)
        )

        for pkgname in package_names:
            yield f"https://rubygems.org/gems/{pkgname}"

    def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
        """Yield the single ListedOrigin described by ``page``.

        Args:
            page: origin url, as produced by ``get_pages``.
        """
        assert self.lister_obj.id is not None

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=page,
            last_update=None,
        )