Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/rubygems/lister.py
| # Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
| # See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| import base64 | |||||
| from datetime import timezone | |||||
| import gzip | |||||
| import logging | import logging | ||||
| from typing import Iterator, List, Optional, Text | import os | ||||
| import shutil | |||||
| import subprocess | |||||
| import tarfile | |||||
| import tempfile | |||||
| from typing import Any, Dict, Iterator, Optional, Tuple | |||||
| from bs4 import BeautifulSoup | |||||
| import psycopg2 | |||||
| from testing.postgresql import Postgresql | |||||
| from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
| from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
| from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
| # Aliasing the page results returned by `get_pages` method from the lister. | RubyGemsListerPage = Dict[str, Any] | ||||
| RubyGemsListerPage = Text | |||||
| class RubyGemsLister(StatelessLister[RubyGemsListerPage]): | class RubyGemsLister(StatelessLister[RubyGemsListerPage]): | ||||
| """Lister for RubyGems.org, the Ruby community’s gem hosting service.""" | """Lister for RubyGems.org, the Ruby community's gem hosting service. | ||||
| Instead of querying the rubygems.org Web API, it uses gems data from the | |||||
| daily PostgreSQL database dump of rubygems. This makes it possible to gather all | |||||
| interesting info about a gem and its release artifacts (version number, | |||||
| download URL, checksums, release date) in an efficient way and without | |||||
| flooding the rubygems Web API with numerous HTTP requests (as there were more | |||||
| than 187000 gems available on 07/10/2022). | |||||
| """ | |||||
| LISTER_NAME = "rubygems" | LISTER_NAME = "rubygems" | ||||
| VISIT_TYPE = "rubygems" | VISIT_TYPE = "rubygems" | ||||
| INSTANCE = "rubygems" | INSTANCE = "rubygems" | ||||
| INDEX_URL = "https://rubygems.org/versions" | RUBY_GEMS_POSTGRES_DUMP_BASE_URL = ( | ||||
| "https://s3-us-west-2.amazonaws.com/rubygems-dumps" | |||||
| ) | |||||
| RUBY_GEMS_POSTGRES_DUMP_LIST_URL = ( | |||||
| f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql" | |||||
| ) | |||||
| RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem" | |||||
| RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}" | |||||
| RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = ( | |||||
| "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json" | |||||
| ) | |||||
| DB_NAME = "rubygems" | |||||
| DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz" | |||||
| def __init__( | def __init__( | ||||
| self, | self, | ||||
| scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||
| credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
| ): | ): | ||||
| super().__init__( | super().__init__( | ||||
| scheduler=scheduler, | scheduler=scheduler, | ||||
| credentials=credentials, | credentials=credentials, | ||||
| instance=self.INSTANCE, | instance=self.INSTANCE, | ||||
| url=self.INDEX_URL, | url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, | ||||
| ) | ) | ||||
| def get_pages(self) -> Iterator[RubyGemsListerPage]: | def get_latest_dump_file(self) -> str: | ||||
| """Yield an iterator which returns 'page' | response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL) | ||||
| xml = BeautifulSoup(response.content, "xml") | |||||
| It uses the index file located at `https://rubygems.org/versions` | contents = xml.find_all("Contents") | ||||
| to get a list of package names. Each page returns an origin url based on | return contents[-1].find("Key").text | ||||
| the following pattern:: | |||||
| def create_rubygems_db( | |||||
| https://rubygems.org/gems/{pkgname} | self, postgresql: Postgresql | ||||
| ) -> Tuple[str, psycopg2._psycopg.connection]: | |||||
| """ | logger.debug("Creating rubygems database") | ||||
| package_names: List[str] = [] | db_dsn = postgresql.dsn() | ||||
| response = self.http_request(url=self.url) | db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME) | ||||
| data = response.content.decode() | db = psycopg2.connect(**db_dsn) | ||||
| db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) | |||||
| # remove the first 3 lines (file headers + first package named '-') | with db.cursor() as cursor: | ||||
| for line in data.splitlines()[3:]: | cursor.execute(f"CREATE DATABASE {self.DB_NAME}") | ||||
| package_names.append(line.split(" ")[0]) | |||||
| db_dsn["database"] = self.DB_NAME | |||||
| db = psycopg2.connect(**db_dsn) | |||||
| db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) | |||||
| with db.cursor() as cursor: | |||||
| cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore") | |||||
| return db_url, db | |||||
| def populate_rubygems_db(self, db_url: str): | |||||
| dump_file = self.get_latest_dump_file() | |||||
| dump_id = dump_file.split("/")[2] | |||||
| response = self.http_request(f"{self.url}/{dump_file}", stream=True) | |||||
| with tempfile.TemporaryDirectory() as temp_dir: | |||||
| logger.debug( | |||||
| "Downloading latest rubygems database dump: %s (%s bytes)", | |||||
| dump_id, | |||||
| response.headers["content-length"], | |||||
| ) | |||||
| dump_file = os.path.join(temp_dir, "rubygems_dump.tar") | |||||
| with open(dump_file, "wb") as dump: | |||||
| for chunk in response.iter_content(chunk_size=1024): | |||||
| dump.write(chunk) | |||||
| with tarfile.open(dump_file) as dump_tar: | |||||
| dump_tar.extractall(temp_dir) | |||||
| logger.debug("Populating rubygems database with dump %s", dump_id) | |||||
| psql = subprocess.Popen( | |||||
| ["psql", "-q", db_url], | |||||
| stdin=subprocess.PIPE, | |||||
| ) | |||||
| # Remove duplicates | # passing value of gzip.open as stdin of subprocess.run makes the process | ||||
| package_names_set: List[str] = list(set(package_names)) | # read raw data instead of decompressed data so we have to use a pipe | ||||
| with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql: | |||||
| shutil.copyfileobj(sql, psql.stdin) # type: ignore | |||||
| # denote end of read file | |||||
| psql.stdin.close() # type: ignore | |||||
| psql.wait() | |||||
| for pkgname in package_names_set: | def get_pages(self) -> Iterator[RubyGemsListerPage]: | ||||
| yield f"https://rubygems.org/gems/{pkgname}" | # spawn a temporary postgres instance (require initdb executable in environment) | ||||
| with Postgresql() as postgresql: | |||||
| db_url, db = self.create_rubygems_db(postgresql) | |||||
| self.populate_rubygems_db(db_url) | |||||
| with db.cursor() as cursor: | |||||
| cursor.execute("SELECT id, name from rubygems") | |||||
| for gem_id, gem_name in cursor.fetchall(): | |||||
| logger.debug("Processing gem named %s", gem_name[1]) | |||||
| with db.cursor() as cursor_v: | |||||
| cursor_v.execute( | |||||
| "SELECT authors, built_at, number, sha256, size from versions " | |||||
| "where rubygem_id = %s", | |||||
| (gem_id,), | |||||
| ) | |||||
| versions = [ | |||||
| { | |||||
| "number": number, | |||||
| "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format( | |||||
| gem=gem_name, version=number | |||||
| ), | |||||
| "date": built_at.replace(tzinfo=timezone.utc), | |||||
| "authors": authors, | |||||
| "sha256": ( | |||||
| base64.decodebytes(sha256.encode()).hex() | |||||
| if sha256 | |||||
| else None | |||||
| ), | |||||
| "size": size, | |||||
| } | |||||
| for authors, built_at, number, sha256, size in cursor_v.fetchall() | |||||
| ] | |||||
| if versions: | |||||
| yield { | |||||
| "name": gem_name, | |||||
| "versions": versions, | |||||
| } | |||||
vlorentz: Unpacking makes it more readable IMO
(also, this avoids interpolating the SQL query, even if it shouldn't be an issue) | |||||
Done. anlambert: Better indeed, thanks! | |||||
| def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]: | ||||
| """Iterate on all pages and yield ListedOrigin instances.""" | |||||
| assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
| artifacts = [] | |||||
| rubygem_metadata = [] | |||||
| for version in page["versions"]: | |||||
| artifacts.append( | |||||
| { | |||||
| "version": version["number"], | |||||
| "filename": version["url"].split("/")[-1], | |||||
| "url": version["url"], | |||||
| "checksums": ( | |||||
| {"sha256": version["sha256"]} if version["sha256"] else {} | |||||
| ), | |||||
| "length": version["size"], | |||||
| } | |||||
| ) | |||||
| rubygem_metadata.append( | |||||
| { | |||||
| "version": version["number"], | |||||
| "date": version["date"].isoformat(), | |||||
| "authors": version["authors"], | |||||
| "extrinsic_metadata_url": ( | |||||
| self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format( | |||||
| gem=page["name"], version=version["number"] | |||||
| ) | |||||
| ), | |||||
| } | |||||
| ) | |||||
| yield ListedOrigin( | yield ListedOrigin( | ||||
| lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
| visit_type=self.VISIT_TYPE, | visit_type=self.VISIT_TYPE, | ||||
| url=page, | url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]), | ||||
| last_update=None, | last_update=max(version["date"] for version in page["versions"]), | ||||
| extra_loader_arguments={ | |||||
| "artifacts": artifacts, | |||||
| "rubygem_metadata": rubygem_metadata, | |||||
| }, | |||||
| ) | ) | ||||
Unpacking makes it more readable IMO
(also, this avoids interpolating the SQL query, even if it shouldn't be an issue)