Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123363
D8639.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
20 KB
Subscribers
None
D8639.diff
View Options
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -42,3 +42,9 @@
[mypy-dulwich.*]
ignore_missing_imports = True
+
+[mypy-testing.postgresql.*]
+ignore_missing_imports = True
+
+[mypy-psycopg2.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,5 @@
tenacity >= 6.2
lxml
dulwich
+testing.postgresql
+psycopg2
diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py
--- a/swh/lister/rubygems/lister.py
+++ b/swh/lister/rubygems/lister.py
@@ -3,8 +3,20 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import base64
+from datetime import timezone
+import gzip
import logging
-from typing import Iterator, List, Optional, Text
+import os
+import shutil
+import subprocess
+import tarfile
+import tempfile
+from typing import Any, Dict, Iterator, Optional, Tuple
+
+from bs4 import BeautifulSoup
+import psycopg2
+from testing.postgresql import Postgresql
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -13,18 +25,39 @@
logger = logging.getLogger(__name__)
-# Aliasing the page results returned by `get_pages` method from the lister.
-RubyGemsListerPage = Text
+RubyGemsListerPage = Dict[str, Any]
class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
- """Lister for RubyGems.org, the Ruby community’s gem hosting service."""
+ """Lister for RubyGems.org, the Ruby community's gem hosting service.
+
+ Instead of querying the rubygems.org Web API, it uses gems data from the
+ daily PostgreSQL database dump of rubygems. This makes it possible to gather
+ all relevant info about a gem and its release artifacts (version number,
+ download URL, checksums, release date) in an efficient way and without
+ flooding the rubygems Web API with numerous HTTP requests (as there are more
+ than 187000 gems available as of 07/10/2022).
+ """
LISTER_NAME = "rubygems"
VISIT_TYPE = "rubygems"
INSTANCE = "rubygems"
- INDEX_URL = "https://rubygems.org/versions"
+ RUBY_GEMS_POSTGRES_DUMP_BASE_URL = (
+ "https://s3-us-west-2.amazonaws.com/rubygems-dumps"
+ )
+ RUBY_GEMS_POSTGRES_DUMP_LIST_URL = (
+ f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql"
+ )
+
+ RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem"
+ RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}"
+ RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = (
+ "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json"
+ )
+
+ DB_NAME = "rubygems"
+ DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz"
def __init__(
self,
@@ -35,41 +68,147 @@
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
- url=self.INDEX_URL,
+ url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
)
- def get_pages(self) -> Iterator[RubyGemsListerPage]:
- """Yield an iterator which returns 'page'
-
- It uses the index file located at `https://rubygems.org/versions`
- to get a list of package names. Each page returns an origin url based on
- the following pattern::
-
- https://rubygems.org/gems/{pkgname}
+ def get_latest_dump_file(self) -> str:
+ response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
+ xml = BeautifulSoup(response.content, "xml")
+ contents = xml.find_all("Contents")
+ return contents[-1].find("Key").text
+
+ def create_rubygems_db(
+ self, postgresql: Postgresql
+ ) -> Tuple[str, psycopg2._psycopg.connection]:
+ logger.debug("Creating rubygems database")
+
+ db_dsn = postgresql.dsn()
+ db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)
+ db = psycopg2.connect(**db_dsn)
+ db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+ with db.cursor() as cursor:
+ cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
+
+ db_dsn["database"] = self.DB_NAME
+
+ db = psycopg2.connect(**db_dsn)
+ db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+ with db.cursor() as cursor:
+ cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")
+
+ return db_url, db
+
+ def populate_rubygems_db(self, db_url: str):
+ dump_file = self.get_latest_dump_file()
+ dump_id = dump_file.split("/")[2]
+
+ response = self.http_request(f"{self.url}/{dump_file}", stream=True)
+
+ with tempfile.TemporaryDirectory() as temp_dir:
+ logger.debug(
+ "Downloading latest rubygems database dump: %s (%s bytes)",
+ dump_id,
+ response.headers["content-length"],
+ )
+ dump_file = os.path.join(temp_dir, "rubygems_dump.tar")
+ with open(dump_file, "wb") as dump:
+ for chunk in response.iter_content(chunk_size=1024):
+ dump.write(chunk)
+
+ with tarfile.open(dump_file) as dump_tar:
+ dump_tar.extractall(temp_dir)
+
+ logger.debug("Populating rubygems database with dump %s", dump_id)
+ psql = subprocess.Popen(
+ ["psql", "-q", db_url],
+ stdin=subprocess.PIPE,
+ )
+
+ # passing value of gzip.open as stdin of subprocess.run makes the process
+ # read raw data instead of decompressed data so we have to use a pipe
+ with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql:
+ shutil.copyfileobj(sql, psql.stdin) # type: ignore
+
+ # denote end of read file
+ psql.stdin.close() # type: ignore
+ psql.wait()
- """
-
- package_names: List[str] = []
- response = self.http_request(url=self.url)
- data = response.content.decode()
-
- # remove the first 3 lines (file headers + first package named '-')
- for line in data.splitlines()[3:]:
- package_names.append(line.split(" ")[0])
-
- # Remove duplicates
- package_names_set: List[str] = list(set(package_names))
-
- for pkgname in package_names_set:
- yield f"https://rubygems.org/gems/{pkgname}"
+    def get_pages(self) -> Iterator[RubyGemsListerPage]:
+        # spawn a temporary postgres instance (requires the initdb executable in environment)
+        with Postgresql() as postgresql:
+            db_url, db = self.create_rubygems_db(postgresql)
+            self.populate_rubygems_db(db_url)
+
+            with db.cursor() as cursor:
+                cursor.execute("SELECT id, name from rubygems")
+                for gem_id, gem_name in cursor.fetchall():
+                    logger.debug("Processing gem named %s", gem_name)
+                    with db.cursor() as cursor_v:
+                        cursor_v.execute(
+                            "SELECT authors, built_at, number, sha256, size from versions "
+                            "where rubygem_id = %s",
+                            (gem_id,),
+                        )
+                        versions = [
+                            {
+                                "number": number,
+                                "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format(
+                                    gem=gem_name, version=number
+                                ),
+                                "date": built_at.replace(tzinfo=timezone.utc),
+                                "authors": authors,
+                                "sha256": (
+                                    base64.decodebytes(sha256.encode()).hex()
+                                    if sha256
+                                    else None
+                                ),
+                                "size": size,
+                            }
+                            for authors, built_at, number, sha256, size in cursor_v.fetchall()
+                        ]
+                    if versions:
+                        yield {
+                            "name": gem_name,
+                            "versions": versions,
+                        }
def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
- """Iterate on all pages and yield ListedOrigin instances."""
assert self.lister_obj.id is not None
+ artifacts = []
+ rubygem_metadata = []
+ for version in page["versions"]:
+ artifacts.append(
+ {
+ "version": version["number"],
+ "filename": version["url"].split("/")[-1],
+ "url": version["url"],
+ "checksums": (
+ {"sha256": version["sha256"]} if version["sha256"] else {}
+ ),
+ "length": version["size"],
+ }
+ )
+ rubygem_metadata.append(
+ {
+ "version": version["number"],
+ "date": version["date"].isoformat(),
+ "authors": version["authors"],
+ "extrinsic_metadata_url": (
+ self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format(
+ gem=page["name"], version=version["number"]
+ )
+ ),
+ }
+ )
+
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
- url=page,
- last_update=None,
+ url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]),
+ last_update=max(version["date"] for version in page["versions"]),
+ extra_loader_arguments={
+ "artifacts": artifacts,
+ "rubygem_metadata": rubygem_metadata,
+ },
)
diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
deleted file mode 100644
--- a/swh/lister/rubygems/tests/data/https_rubygems.org/versions
+++ /dev/null
@@ -1,6 +0,0 @@
-created_at: 2022-09-01T00:00:05Z
----
-- 1 05d0116933ba44b0b5d0ee19bfd35ccc
-mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
-mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
-mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
diff --git a/swh/lister/rubygems/tests/data/rubygems_dumps.xml b/swh/lister/rubygems/tests/data/rubygems_dumps.xml
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/rubygems_dumps.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+ <Name>rubygems-dumps</Name>
+ <Prefix>production/public_postgresql</Prefix>
+ <Marker></Marker>
+ <MaxKeys>1000</MaxKeys>
+ <IsTruncated>false</IsTruncated>
+ <Contents>
+ <Key>production/public_postgresql/2022.10.05.06.10.11/public_postgresql.tar</Key>
+ <LastModified>2022-10-05T06:11:15.000Z</LastModified>
+ <ETag>"d1c447a2a490225c2d59061e60ed86e9-75"</ETag>
+ <Size>391653888</Size>
+ <StorageClass>STANDARD</StorageClass>
+ </Contents>
+ <Contents>
+ <Key>production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar</Key>
+ <LastModified>2022-10-06T06:11:11.000Z</LastModified>
+ <ETag>"2ccd9340e4f802ec982e4cd00db2d168-75"</ETag>
+ <Size>390047744</Size>
+ <StorageClass>STANDARD</StorageClass>
+ </Contents>
+</ListBucketResult>
\ No newline at end of file
diff --git a/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar b/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/rubygems/tests/data/small_rubygems_dump.sh b/swh/lister/rubygems/tests/data/small_rubygems_dump.sh
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/small_rubygems_dump.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# this script requires a PostgreSQL server running on the host;
+# it generates the rubygems_pgsql_dump.tar file used as test data,
+# which contains a very small subset of gems for testing purposes
+
+cd /tmp
+
+# download rubygems load-pg-dump utility script
+curl -O https://raw.githubusercontent.com/rubygems/rubygems.org/1c8cf7e079e56f709e7fc8f4b2398637e41815f2/script/load-pg-dump
+
+# download latest rubygems pgsql dump and load rubygems db in local pgsql server
+./load-pg-dump -c rubygems_dump.tar
+
+# remove all rows in the rubygems db not related to gem haar_joke or l33tify
+# those gems have few releases so that is why they have been picked
+# also drop tables not needed by the rubygems lister
+cleanup_script=$(cat <<- EOF
+with t as (
+ select id from rubygems where name = 'haar_joke'
+),
+t2 as (
+ select id from rubygems where name = 'l33tify'
+) delete from versions where rubygem_id != (select id from t) and rubygem_id != (select id from t2);
+
+delete from rubygems where name != 'haar_joke' and name != 'l33tify';
+
+drop table dependencies;
+drop table gem_downloads;
+drop table linksets;
+EOF
+)
+echo $cleanup_script | psql rubygems
+
+# create the rubygems_pgsql_dump.tar file
+mkdir -p public_postgresql/databases
+pg_dump rubygems | gzip -c > public_postgresql/databases/PostgreSQL.sql.gz
+tar -cvf rubygems_pgsql_dump.tar public_postgresql
diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py
--- a/swh/lister/rubygems/tests/test_lister.py
+++ b/swh/lister/rubygems/tests/test_lister.py
@@ -2,26 +2,153 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+
+# flake8: noqa: B950
+
+from pathlib import Path
+
+import iso8601
+import pytest
+
from swh.lister.rubygems.lister import RubyGemsLister
+from swh.scheduler.model import ListedOrigin
+
+DUMP_FILEPATH = "production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar"
-expected_origins = [
- "https://rubygems.org/gems/mercurial-ruby",
- "https://rubygems.org/gems/mercurial-wrapper",
- "https://rubygems.org/gems/mercurius",
-]
+@pytest.fixture
+def expected_listed_origins():
+ return [
+ {
+ "url": "https://rubygems.org/gems/haar_joke",
+ "visit_type": "rubygems",
+ "last_update": iso8601.parse_date("2016-11-05T00:00:00+00:00"),
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "url": "https://rubygems.org/downloads/haar_joke-0.0.2.gem",
+ "length": 8704,
+ "version": "0.0.2",
+ "filename": "haar_joke-0.0.2.gem",
+ "checksums": {
+ "sha256": "85a8cf5f41890e9605265eeebfe9e99aa0350a01a3c799f9f55a0615a31a2f5f"
+ },
+ },
+ {
+ "url": "https://rubygems.org/downloads/haar_joke-0.0.1.gem",
+ "length": 8704,
+ "version": "0.0.1",
+ "filename": "haar_joke-0.0.1.gem",
+ "checksums": {
+ "sha256": "a2ee7052fb8ffcfc4ec0fdb77fae9a36e473f859af196a36870a0f386b5ab55e"
+ },
+ },
+ ],
+ "rubygem_metadata": [
+ {
+ "date": "2016-11-05T00:00:00+00:00",
+ "authors": "Gemma Gotch",
+ "version": "0.0.2",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.2.json",
+ },
+ {
+ "date": "2016-07-23T00:00:00+00:00",
+ "authors": "Gemma Gotch",
+ "version": "0.0.1",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.1.json",
+ },
+ ],
+ },
+ },
+ {
+ "url": "https://rubygems.org/gems/l33tify",
+ "visit_type": "rubygems",
+ "last_update": iso8601.parse_date("2014-11-14T00:00:00+00:00"),
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "url": "https://rubygems.org/downloads/l33tify-0.0.2.gem",
+ "length": 6144,
+ "version": "0.0.2",
+ "filename": "l33tify-0.0.2.gem",
+ "checksums": {
+ "sha256": "0087a21fb6161bba8892df40de3b5e27404f941658084413b8fde49db2bc7c9f"
+ },
+ },
+ {
+ "url": "https://rubygems.org/downloads/l33tify-0.0.3.gem",
+ "length": 6144,
+ "version": "0.0.3",
+ "filename": "l33tify-0.0.3.gem",
+ "checksums": {
+ "sha256": "4502097ddf2657d561ce0f527ef1f49f1658c8a0968ab8cc853273138f8382a2"
+ },
+ },
+ {
+ "url": "https://rubygems.org/downloads/l33tify-0.0.1.gem",
+ "length": 6144,
+ "version": "0.0.1",
+ "filename": "l33tify-0.0.1.gem",
+ "checksums": {
+ "sha256": "5abfb737ce5cf561726f2f7cc1ba0f0e4f865f8b7283192e05eb3f246d3dbbca"
+ },
+ },
+ ],
+ "rubygem_metadata": [
+ {
+ "date": "2014-11-14T00:00:00+00:00",
+ "authors": "E Alexander Liedtke",
+ "version": "0.0.2",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.2.json",
+ },
+ {
+ "date": "2014-11-14T00:00:00+00:00",
+ "authors": "E Alexander Liedtke",
+ "version": "0.0.3",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.3.json",
+ },
+ {
+ "date": "2014-11-14T00:00:00+00:00",
+ "authors": "E Alexander Liedtke",
+ "version": "0.0.1",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.1.json",
+ },
+ ],
+ },
+ },
+ ]
-def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
+
+@pytest.fixture(autouse=True)
+def network_requests_mock(datadir, requests_mock):
+ requests_mock.get(
+ RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_LIST_URL,
+ content=Path(datadir, "rubygems_dumps.xml").read_bytes(),
+ )
+ content = Path(datadir, "rubygems_pgsql_dump.tar").read_bytes()
+ requests_mock.get(
+ f"{RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_BASE_URL}/{DUMP_FILEPATH}",
+ content=content,
+ headers={"content-length": str(len(content))},
+ )
+
+
+@pytest.mark.db
+def test_rubygems_lister(swh_scheduler, expected_listed_origins):
lister = RubyGemsLister(scheduler=swh_scheduler)
res = lister.run()
- assert res.pages == 3
- assert res.origins == 1 + 1 + 1
+ assert res.pages == 2
+ assert res.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- assert len(scheduler_origins) == len(expected_origins)
-
- for origin in scheduler_origins:
- assert origin.visit_type == "rubygems"
- assert origin.url in expected_origins
+ assert [
+ {
+ "url": origin.url,
+ "visit_type": origin.visit_type,
+ "last_update": origin.last_update,
+ "extra_loader_arguments": origin.extra_loader_arguments,
+ }
+ for origin in scheduler_origins
+ ] == expected_listed_origins
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Dec 19, 6:34 AM (6 h, 36 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221150
Attached To
D8639: rubygems: Use gems database dump to improve listing output
Event Timeline
Log In to Comment