Differential D8639 Diff 31214 swh/lister/rubygems/lister.py

Changeset View

Standalone View

View Options

swh/lister/rubygems/lister.py

# See the AUTHORS file at the top-level directory of this distribution

# License: GNU General Public License version 3, or any later version

# See top-level LICENSE file for more information

import base64

from datetime import timezone

import gzip

import logging

from typing import Iterator, List, Optional, Text

import os

import shutil

import subprocess

import tarfile

import tempfile

from typing import Any, Dict, Iterator, Optional, Tuple

from bs4 import BeautifulSoup

import psycopg2

from testing.postgresql import Postgresql

from swh.scheduler.interface import SchedulerInterface

from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, StatelessLister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.

# A page is a mapping describing one gem: `get_pages` yields dicts with the
# keys "name" and "versions", which `get_origins_from_page` reads back.
RubyGemsListerPage = Dict[str, Any]

class RubyGemsLister(StatelessLister[RubyGemsListerPage]):

"""Lister for RubyGems.org, the Ruby community’s gem hosting service."""

"""Lister for RubyGems.org, the Ruby community's gem hosting service.

Instead of querying rubygems.org Web API, it uses gems data from the

daily PostgreSQL database dump of rubygems. It enables gathering all

interesting info about a gem and its release artifacts (version number,

download URL, checksums, release date) in an efficient way and without

flooding rubygems Web API with numerous HTTP requests (as there are more

than 187000 gems available on 07/10/2022).

"""

LISTER_NAME = "rubygems"

VISIT_TYPE = "rubygems"

INSTANCE = "rubygems"

INDEX_URL = "https://rubygems.org/versions"

RUBY_GEMS_POSTGRES_DUMP_BASE_URL = (

"https://s3-us-west-2.amazonaws.com/rubygems-dumps"

)

RUBY_GEMS_POSTGRES_DUMP_LIST_URL = (

f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql"

)

RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem"

RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}"

RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = (

"https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json"

)

DB_NAME = "rubygems"

DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz"

def __init__(
    self,
    scheduler: SchedulerInterface,
    credentials: Optional[CredentialsType] = None,
):
    """Initialize the lister with the rubygems dump location as its URL.

    Args:
        scheduler: scheduler interface forwarded to the base lister
        credentials: optional credentials forwarded to the base lister
    """
    super().__init__(
        scheduler=scheduler,
        credentials=credentials,
        instance=self.INSTANCE,
        # populate_rubygems_db builds the dump download URL from self.url
        # (presumably set by the base class from this argument - confirm),
        # so the dump base URL is passed rather than the gem index URL.
        url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
    )

def get_pages(self) -> Iterator[RubyGemsListerPage]:

def get_latest_dump_file(self) -> str:
    """Return the key of the last entry in the rubygems dump listing.

    The listing at ``RUBY_GEMS_POSTGRES_DUMP_LIST_URL`` is fetched and parsed
    as XML; the text of the ``Key`` element found in the last ``Contents``
    element is returned.

    Raises:
        IndexError: if the listing holds no ``Contents`` element.
    """
    response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
    xml = BeautifulSoup(response.content, "xml")
    contents = xml.find_all("Contents")
    # NOTE(review): the last listed entry is treated as the most recent dump;
    # this presumes the listing is ordered by a date-bearing key - confirm.
    return contents[-1].find("Key").text

the following pattern::

def create_rubygems_db(
    self, postgresql: Postgresql
) -> Tuple[str, psycopg2._psycopg.connection]:
    """Create the ``DB_NAME`` database on the given server and connect to it.

    Args:
        postgresql: running temporary PostgreSQL instance

    Returns:
        A tuple made of the connection URL of the created database and an
        open connection to it, with the ``hstore`` extension created.
    """
    logger.debug("Creating rubygems database")

    db_dsn = postgresql.dsn()
    # same URL as the server default one, with the database name swapped
    db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)

    # First connect to the server default database, only to create ours.
    bootstrap_db = psycopg2.connect(**db_dsn)
    try:
        # CREATE DATABASE cannot be executed inside a transaction block,
        # hence the autocommit isolation level.
        bootstrap_db.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT
        )
        with bootstrap_db.cursor() as cursor:
            cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
    finally:
        # this connection is not returned to the caller, so release it here
        bootstrap_db.close()

    db_dsn["database"] = self.DB_NAME

    db = psycopg2.connect(**db_dsn)
    db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
    with db.cursor() as cursor:
        cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")

    return db_url, db

def populate_rubygems_db(self, db_url: str):
    """Download the latest rubygems database dump and load it with ``psql``.

    The dump archive is downloaded into a temporary directory, extracted,
    and the gzipped SQL file located at ``DUMP_SQL_PATH`` inside it is
    decompressed and piped to a ``psql`` process connected to ``db_url``.

    Args:
        db_url: connection URL of the database to populate
    """
    dump_key = self.get_latest_dump_file()
    # third path component of the key, used to identify the dump in logs
    dump_id = dump_key.split("/")[2]

    response = self.http_request(f"{self.url}/{dump_key}", stream=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        logger.debug(
            "Downloading latest rubygems database dump: %s (%s bytes)",
            dump_id,
            # the size is only logged, so tolerate a missing header
            response.headers.get("content-length", "unknown"),
        )
        dump_path = os.path.join(temp_dir, "rubygems_dump.tar")
        with open(dump_path, "wb") as dump:
            for chunk in response.iter_content(chunk_size=1024):
                dump.write(chunk)

        # NOTE(review): extractall trusts the member paths of the archive;
        # presumably fine as the dump comes from the official bucket - confirm.
        with tarfile.open(dump_path) as dump_tar:
            dump_tar.extractall(temp_dir)

        logger.debug("Populating rubygems database with dump %s", dump_id)

        # Leaving the Popen context closes stdin (which signals the end of
        # the input to psql) and waits for the process, even when the copy
        # below raises.
        with subprocess.Popen(
            ["psql", "-q", db_url],
            stdin=subprocess.PIPE,
        ) as psql:
            # passing value of gzip.open as stdin of subprocess.run makes the
            # process read raw data instead of decompressed data so we have
            # to use a pipe
            sql_path = os.path.join(temp_dir, self.DUMP_SQL_PATH)
            with gzip.open(sql_path, "rb") as sql:
                shutil.copyfileobj(sql, psql.stdin)  # type: ignore

for pkgname in package_names_set:

def get_pages(self) -> Iterator[RubyGemsListerPage]:
    """Yield one page per gem that has at least one version in the dump.

    A temporary PostgreSQL instance is spawned and populated with the latest
    rubygems dump, then the ``rubygems`` and ``versions`` tables are queried.

    Yields:
        A dict with the keys ``name`` (gem name) and ``versions``, a list of
        dicts with the keys ``number``, ``url``, ``date``, ``authors``,
        ``sha256`` and ``size``.
    """
    # spawn a temporary postgres instance (requires initdb executable in
    # environment)
    with Postgresql() as postgresql:
        db_url, db = self.create_rubygems_db(postgresql)
        try:
            self.populate_rubygems_db(db_url)

            with db.cursor() as cursor:
                cursor.execute("SELECT id, name from rubygems")
                for gem_id, gem_name in cursor.fetchall():
                    logger.debug("Processing gem named %s", gem_name)
                    with db.cursor() as cursor_v:
                        cursor_v.execute(
                            "SELECT authors, built_at, number, sha256, size from versions "
                            "where rubygem_id = %s",
                            (gem_id,),
                        )
                        versions = [
                            {
                                "number": number,
                                "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format(
                                    gem=gem_name, version=number
                                ),
                                # the stored timestamp is tagged as UTC
                                "date": built_at.replace(tzinfo=timezone.utc),
                                "authors": authors,
                                # the base64 text from the dump is converted
                                # to a hexadecimal digest when present
                                "sha256": (
                                    base64.decodebytes(sha256.encode()).hex()
                                    if sha256
                                    else None
                                ),
                                "size": size,
                            }
                            for authors, built_at, number, sha256, size in (
                                cursor_v.fetchall()
                            )
                        ]
                    if versions:
                        yield {
                            "name": gem_name,
                            "versions": versions,
                        }
        finally:
            # release the connection before the temporary server shuts down
            db.close()

vlorentzUnsubmitted

Not Done

cursor.execute("SELECT id, name from rubygems")

- for row_gem in cursor.fetchall():

- logger.debug("Processing gem named %s", row_gem[1])

+ for (gem_id, gem_name) in cursor.fetchall():

+ logger.debug("Processing gem named %s", gem_name)

with db.cursor() as cursor_v:

cursor_v.execute(

"SELECT authors, built_at, number, sha256, size from versions "

- f"where rubygem_id = {row_gem[0]}",

+ "where rubygem_id = %s",

+ (gem_id,)

)

versions = [

{

- "number": row[2],

+ "number": version,

"url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format(

- gem=row_gem[1], version=row[2]

+ gem=gem_name, version=version

- "date": row[1].replace(tzinfo=timezone.utc),

+ "date": date.replace(tzinfo=timezone.utc),

"authors": row[0],

"sha256": (

- base64.decodebytes(row[3].encode()).hex()

- if row[3]

+ base64.decodebytes(hash_.encode()).hex()

+ if hash_

else None

"size": row[4],

}

- for row in cursor_v.fetchall()

+ for (authors, date, version, hash_) in cursor_v.fetchall()

]

if versions:

yield {

"origin_url": self.RUBY_GEM_ORIGIN_URL_PATTERN.format(

- gem=row_gem[1]

+ gem=gem_name

"versions": versions,

}

def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:

unpacking makes it more readable IMO

(also, this avoids interpolating values into the SQL query, even if it shouldn't be an issue)

vlorentz: unpacking makes it more readable IMO (also, this avoids interpolating values into the SQL query, even if…

anlambertAuthorUnsubmitted

Done

Better indeed, thanks !

anlambert: Better indeed, thanks !

def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
    """Convert a gem page into a single ListedOrigin.

    Args:
        page: dict with the keys ``name`` and ``versions``; each version is
            a dict read through the keys ``number``, ``url``, ``date``,
            ``authors``, ``sha256`` and ``size``.

    Yields:
        One ListedOrigin whose URL follows ``RUBY_GEM_ORIGIN_URL_PATTERN``,
        whose last update is the greatest version date, and whose extra
        loader arguments hold the ``artifacts`` and ``rubygem_metadata``
        lists built from the versions.
    """
    assert self.lister_obj.id is not None

    artifacts = []
    rubygem_metadata = []
    for version in page["versions"]:
        artifacts.append(
            {
                "version": version["number"],
                # last path component of the download URL
                "filename": version["url"].split("/")[-1],
                "url": version["url"],
                # no checksum entry at all when the digest is missing
                "checksums": (
                    {"sha256": version["sha256"]} if version["sha256"] else {}
                ),
                "length": version["size"],
            }
        )
        rubygem_metadata.append(
            {
                "version": version["number"],
                "date": version["date"].isoformat(),
                "authors": version["authors"],
                "extrinsic_metadata_url": (
                    self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format(
                        gem=page["name"], version=version["number"]
                    )
                ),
            }
        )

    yield ListedOrigin(
        lister_id=self.lister_obj.id,
        visit_type=self.VISIT_TYPE,
        url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]),
        last_update=max(version["date"] for version in page["versions"]),
        extra_loader_arguments={
            "artifacts": artifacts,
            "rubygem_metadata": rubygem_metadata,
        },
    )