Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/rubygems/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import base64 | |||||
from datetime import timezone | |||||
import gzip | |||||
import logging | import logging | ||||
from typing import Iterator, List, Optional, Text | import os | ||||
import shutil | |||||
import subprocess | |||||
import tarfile | |||||
import tempfile | |||||
from typing import Any, Dict, Iterator, Optional, Tuple | |||||
from bs4 import BeautifulSoup | |||||
import psycopg2 | |||||
from testing.postgresql import Postgresql | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
# Aliasing the page results returned by `get_pages` method from the lister. | RubyGemsListerPage = Dict[str, Any] | ||||
RubyGemsListerPage = Text | |||||
class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
    """Lister for RubyGems.org, the Ruby community's gem hosting service.

    Instead of querying the rubygems.org Web API, it uses gems data from the
    daily PostgreSQL database dump of rubygems. This makes it possible to
    gather all interesting info about a gem and its release artifacts (version
    number, download URL, checksums, release date) in an efficient way and
    without flooding the rubygems Web API with numerous HTTP requests (as there
    are more than 187000 gems available on 07/10/2022).
    """

    LISTER_NAME = "rubygems"
    VISIT_TYPE = "rubygems"
    INSTANCE = "rubygems"

    RUBY_GEMS_POSTGRES_DUMP_BASE_URL = (
        "https://s3-us-west-2.amazonaws.com/rubygems-dumps"
    )
    RUBY_GEMS_POSTGRES_DUMP_LIST_URL = (
        f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql"
    )

    RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem"
    RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}"
    RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = (
        "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json"
    )

    DB_NAME = "rubygems"
    DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister with the dump bucket base URL as its ``url``."""
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
        )

    def get_latest_dump_file(self) -> str:
        """Return the key of the last dump listed in the dumps bucket.

        The bucket listing is fetched from ``RUBY_GEMS_POSTGRES_DUMP_LIST_URL``
        and parsed as XML; the ``Key`` text of the last ``Contents`` element is
        returned.
        """
        response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
        xml = BeautifulSoup(response.content, "xml")
        contents = xml.find_all("Contents")
        # NOTE(review): assumes the listing is non-empty, ordered oldest to
        # newest and not truncated, so that the last entry is the latest dump
        # -- confirm against the bucket listing semantics.
        return contents[-1].find("Key").text

    def create_rubygems_db(
        self, postgresql: Postgresql
    ) -> Tuple[str, psycopg2.extensions.connection]:
        """Create the ``DB_NAME`` database in the given postgres instance.

        Args:
            postgresql: the temporary postgres instance to create the
                database in

        Returns:
            A tuple ``(db_url, db)`` holding the URL of the created database
            and an open autocommit connection to it. The caller is
            responsible for closing that connection.
        """
        logger.debug("Creating rubygems database")

        db_dsn = postgresql.dsn()
        db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)

        # This first connection targets the default database and is only
        # needed to issue CREATE DATABASE, so close it once done.
        admin_db = psycopg2.connect(**db_dsn)
        try:
            # CREATE DATABASE cannot be executed inside a transaction block
            admin_db.set_isolation_level(
                psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT
            )
            with admin_db.cursor() as cursor:
                cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
        finally:
            admin_db.close()

        db_dsn["database"] = self.DB_NAME

        db = psycopg2.connect(**db_dsn)
        db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        with db.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")

        return db_url, db

    def populate_rubygems_db(self, db_url: str):
        """Download the latest rubygems dump and load it with ``psql``.

        Args:
            db_url: URL of the database to populate, passed to ``psql``

        Raises:
            subprocess.CalledProcessError: if ``psql`` exits with a non-zero
                status
        """
        dump_key = self.get_latest_dump_file()
        dump_id = dump_key.split("/")[2]

        response = self.http_request(f"{self.url}/{dump_key}", stream=True)
        with tempfile.TemporaryDirectory() as temp_dir:
            logger.debug(
                "Downloading latest rubygems database dump: %s (%s bytes)",
                dump_id,
                # the header may be absent; logging must not make listing fail
                response.headers.get("content-length", "unknown"),
            )
            dump_path = os.path.join(temp_dir, "rubygems_dump.tar")
            with open(dump_path, "wb") as dump:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    dump.write(chunk)

            with tarfile.open(dump_path) as dump_tar:
                # NOTE(review): extractall trusts the member paths of the
                # downloaded archive; consider restricting extraction to
                # DUMP_SQL_PATH.
                dump_tar.extractall(temp_dir)

            logger.debug("Populating rubygems database with dump %s", dump_id)
            psql_cmd = ["psql", "-q", db_url]

            # passing value of gzip.open as stdin of subprocess.run makes the
            # process read raw data instead of decompressed data so we have to
            # use a pipe; leaving the Popen context closes stdin (denoting end
            # of input) and waits for psql, even when the copy fails
            with subprocess.Popen(psql_cmd, stdin=subprocess.PIPE) as psql:
                sql_path = os.path.join(temp_dir, self.DUMP_SQL_PATH)
                with gzip.open(sql_path, "rb") as sql:
                    shutil.copyfileobj(sql, psql.stdin)  # type: ignore

            if psql.returncode != 0:
                raise subprocess.CalledProcessError(psql.returncode, psql_cmd)

    def get_pages(self) -> Iterator[RubyGemsListerPage]:
        """Yield one page per gem having at least one version in the dump.

        Each page is a dict with the gem ``name`` and its ``versions``, the
        latter being a list of dicts with keys ``number``, ``url``, ``date``,
        ``authors``, ``sha256`` and ``size``.
        """
        # spawn a temporary postgres instance (require initdb executable in environment)
        with Postgresql() as postgresql:
            db_url, db = self.create_rubygems_db(postgresql)
            try:
                self.populate_rubygems_db(db_url)

                with db.cursor() as cursor:
                    cursor.execute("SELECT id, name from rubygems")
                    for gem_id, gem_name in cursor.fetchall():
                        logger.debug("Processing gem named %s", gem_name)
                        with db.cursor() as cursor_v:
                            cursor_v.execute(
                                "SELECT authors, built_at, number, sha256, size from versions "
                                "where rubygem_id = %s",
                                (gem_id,),
                            )
                            versions = [
                                {
                                    "number": number,
                                    "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format(
                                        gem=gem_name, version=number
                                    ),
                                    "date": built_at.replace(tzinfo=timezone.utc),
                                    "authors": authors,
                                    # convert the base64 encoded checksum
                                    # read from the database to hexadecimal
                                    "sha256": (
                                        base64.decodebytes(sha256.encode()).hex()
                                        if sha256
                                        else None
                                    ),
                                    "size": size,
                                }
                                for authors, built_at, number, sha256, size in cursor_v.fetchall()
                            ]
                        if versions:
                            yield {
                                "name": gem_name,
                                "versions": versions,
                            }
            finally:
                # release the connection before the temporary postgres
                # instance is stopped, also when the generator is closed early
                db.close()

    def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
        """Yield the ListedOrigin built from a gem page.

        The origin carries, as extra loader arguments, one ``artifacts`` entry
        and one ``rubygem_metadata`` entry per gem version; its last update
        date is the most recent version date.
        """
        assert self.lister_obj.id is not None

        artifacts = []
        rubygem_metadata = []
        for version in page["versions"]:
            artifacts.append(
                {
                    "version": version["number"],
                    "filename": version["url"].split("/")[-1],
                    "url": version["url"],
                    "checksums": (
                        {"sha256": version["sha256"]} if version["sha256"] else {}
                    ),
                    "length": version["size"],
                }
            )
            rubygem_metadata.append(
                {
                    "version": version["number"],
                    "date": version["date"].isoformat(),
                    "authors": version["authors"],
                    "extrinsic_metadata_url": (
                        self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format(
                            gem=page["name"], version=version["number"]
                        )
                    ),
                }
            )

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]),
            last_update=max(version["date"] for version in page["versions"]),
            extra_loader_arguments={
                "artifacts": artifacts,
                "rubygem_metadata": rubygem_metadata,
            },
        )
Unpacking makes it more readable IMO
(also, this avoids interpolating the SQL query, even if it shouldn't be an issue)