Page MenuHomeSoftware Heritage

D8639.diff
No OneTemporary

D8639.diff

diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -42,3 +42,9 @@
[mypy-dulwich.*]
ignore_missing_imports = True
+
+[mypy-testing.postgresql.*]
+ignore_missing_imports = True
+
+[mypy-psycopg2.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,5 @@
tenacity >= 6.2
lxml
dulwich
+testing.postgresql
+psycopg2
diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py
--- a/swh/lister/rubygems/lister.py
+++ b/swh/lister/rubygems/lister.py
@@ -3,8 +3,20 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import base64
+from datetime import timezone
+import gzip
import logging
-from typing import Iterator, List, Optional, Text
+import os
+import shutil
+import subprocess
+import tarfile
+import tempfile
+from typing import Any, Dict, Iterator, Optional, Tuple
+
+from bs4 import BeautifulSoup
+import psycopg2
+from testing.postgresql import Postgresql
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -13,18 +25,39 @@
logger = logging.getLogger(__name__)
-# Aliasing the page results returned by `get_pages` method from the lister.
-RubyGemsListerPage = Text
+RubyGemsListerPage = Dict[str, Any]
class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
- """Lister for RubyGems.org, the Ruby community’s gem hosting service."""
+ """Lister for RubyGems.org, the Ruby community's gem hosting service.
+
+ Instead of querying rubygems.org Web API, it uses gems data from the
+ daily PostgreSQL database dump of rubygems. This makes it possible to gather
+ all interesting info about a gem and its release artifacts (version number,
+ download URL, checksums, release date) in an efficient way and without
+ flooding rubygems Web API with numerous HTTP requests (as there are more
+ than 187000 gems available on 07/10/2022).
+ """
LISTER_NAME = "rubygems"
VISIT_TYPE = "rubygems"
INSTANCE = "rubygems"
- INDEX_URL = "https://rubygems.org/versions"
+ RUBY_GEMS_POSTGRES_DUMP_BASE_URL = (
+ "https://s3-us-west-2.amazonaws.com/rubygems-dumps"
+ )
+ RUBY_GEMS_POSTGRES_DUMP_LIST_URL = (
+ f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql"
+ )
+
+ RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem"
+ RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}"
+ RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = (
+ "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json"
+ )
+
+ DB_NAME = "rubygems"
+ DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz"
def __init__(
self,
@@ -35,41 +68,147 @@
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
- url=self.INDEX_URL,
+ url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
)
- def get_pages(self) -> Iterator[RubyGemsListerPage]:
- """Yield an iterator which returns 'page'
-
- It uses the index file located at `https://rubygems.org/versions`
- to get a list of package names. Each page returns an origin url based on
- the following pattern::
-
- https://rubygems.org/gems/{pkgname}
+ def get_latest_dump_file(self) -> str:
+ response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL)
+ xml = BeautifulSoup(response.content, "xml")
+ contents = xml.find_all("Contents")
+ return contents[-1].find("Key").text
+
+ def create_rubygems_db(
+ self, postgresql: Postgresql
+ ) -> Tuple[str, psycopg2._psycopg.connection]:
+ logger.debug("Creating rubygems database")
+
+ db_dsn = postgresql.dsn()
+ db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME)
+ db = psycopg2.connect(**db_dsn)
+ db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+ with db.cursor() as cursor:
+ cursor.execute(f"CREATE DATABASE {self.DB_NAME}")
+
+ db_dsn["database"] = self.DB_NAME
+
+ db = psycopg2.connect(**db_dsn)
+ db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+ with db.cursor() as cursor:
+ cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore")
+
+ return db_url, db
+
+ def populate_rubygems_db(self, db_url: str):
+ dump_file = self.get_latest_dump_file()
+ dump_id = dump_file.split("/")[2]
+
+ response = self.http_request(f"{self.url}/{dump_file}", stream=True)
+
+ with tempfile.TemporaryDirectory() as temp_dir:
+ logger.debug(
+ "Downloading latest rubygems database dump: %s (%s bytes)",
+ dump_id,
+ response.headers["content-length"],
+ )
+ dump_file = os.path.join(temp_dir, "rubygems_dump.tar")
+ with open(dump_file, "wb") as dump:
+ for chunk in response.iter_content(chunk_size=1024):
+ dump.write(chunk)
+
+ with tarfile.open(dump_file) as dump_tar:
+ dump_tar.extractall(temp_dir)
+
+ logger.debug("Populating rubygems database with dump %s", dump_id)
+ psql = subprocess.Popen(
+ ["psql", "-q", db_url],
+ stdin=subprocess.PIPE,
+ )
+
+ # passing value of gzip.open as stdin of subprocess.run makes the process
+ # read raw data instead of decompressed data so we have to use a pipe
+ with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql:
+ shutil.copyfileobj(sql, psql.stdin) # type: ignore
+
+ # close stdin so that psql sees end of input and can terminate
+ psql.stdin.close() # type: ignore
+ psql.wait()
- """
-
- package_names: List[str] = []
- response = self.http_request(url=self.url)
- data = response.content.decode()
-
- # remove the first 3 lines (file headers + first package named '-')
- for line in data.splitlines()[3:]:
- package_names.append(line.split(" ")[0])
-
- # Remove duplicates
- package_names_set: List[str] = list(set(package_names))
-
- for pkgname in package_names_set:
- yield f"https://rubygems.org/gems/{pkgname}"
+ def get_pages(self) -> Iterator[RubyGemsListerPage]:
+ # spawn a temporary postgres instance (requires the initdb executable in environment)
+ with Postgresql() as postgresql:
+ db_url, db = self.create_rubygems_db(postgresql)
+ self.populate_rubygems_db(db_url)
+
+ with db.cursor() as cursor:
+ cursor.execute("SELECT id, name from rubygems")
+ for gem_id, gem_name in cursor.fetchall():
+ logger.debug("Processing gem named %s", gem_name)
+ with db.cursor() as cursor_v:
+ cursor_v.execute(
+ "SELECT authors, built_at, number, sha256, size from versions "
+ "where rubygem_id = %s",
+ (gem_id,),
+ )
+ versions = [
+ {
+ "number": number,
+ "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format(
+ gem=gem_name, version=number
+ ),
+ "date": built_at.replace(tzinfo=timezone.utc),
+ "authors": authors,
+ "sha256": (
+ base64.decodebytes(sha256.encode()).hex()
+ if sha256
+ else None
+ ),
+ "size": size,
+ }
+ for authors, built_at, number, sha256, size in cursor_v.fetchall()
+ ]
+ if versions:
+ yield {
+ "name": gem_name,
+ "versions": versions,
+ }
def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
- """Iterate on all pages and yield ListedOrigin instances."""
assert self.lister_obj.id is not None
+ artifacts = []
+ rubygem_metadata = []
+ for version in page["versions"]:
+ artifacts.append(
+ {
+ "version": version["number"],
+ "filename": version["url"].split("/")[-1],
+ "url": version["url"],
+ "checksums": (
+ {"sha256": version["sha256"]} if version["sha256"] else {}
+ ),
+ "length": version["size"],
+ }
+ )
+ rubygem_metadata.append(
+ {
+ "version": version["number"],
+ "date": version["date"].isoformat(),
+ "authors": version["authors"],
+ "extrinsic_metadata_url": (
+ self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format(
+ gem=page["name"], version=version["number"]
+ )
+ ),
+ }
+ )
+
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
- url=page,
- last_update=None,
+ url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]),
+ last_update=max(version["date"] for version in page["versions"]),
+ extra_loader_arguments={
+ "artifacts": artifacts,
+ "rubygem_metadata": rubygem_metadata,
+ },
)
diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
deleted file mode 100644
--- a/swh/lister/rubygems/tests/data/https_rubygems.org/versions
+++ /dev/null
@@ -1,6 +0,0 @@
-created_at: 2022-09-01T00:00:05Z
----
-- 1 05d0116933ba44b0b5d0ee19bfd35ccc
-mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
-mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
-mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
diff --git a/swh/lister/rubygems/tests/data/rubygems_dumps.xml b/swh/lister/rubygems/tests/data/rubygems_dumps.xml
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/rubygems_dumps.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+ <Name>rubygems-dumps</Name>
+ <Prefix>production/public_postgresql</Prefix>
+ <Marker></Marker>
+ <MaxKeys>1000</MaxKeys>
+ <IsTruncated>false</IsTruncated>
+ <Contents>
+ <Key>production/public_postgresql/2022.10.05.06.10.11/public_postgresql.tar</Key>
+ <LastModified>2022-10-05T06:11:15.000Z</LastModified>
+ <ETag>&quot;d1c447a2a490225c2d59061e60ed86e9-75&quot;</ETag>
+ <Size>391653888</Size>
+ <StorageClass>STANDARD</StorageClass>
+ </Contents>
+ <Contents>
+ <Key>production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar</Key>
+ <LastModified>2022-10-06T06:11:11.000Z</LastModified>
+ <ETag>&quot;2ccd9340e4f802ec982e4cd00db2d168-75&quot;</ETag>
+ <Size>390047744</Size>
+ <StorageClass>STANDARD</StorageClass>
+ </Contents>
+</ListBucketResult>
\ No newline at end of file
diff --git a/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar b/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/rubygems/tests/data/small_rubygems_dump.sh b/swh/lister/rubygems/tests/data/small_rubygems_dump.sh
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/small_rubygems_dump.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# this script requires a PostgreSQL server running on the host,
+# it generates the rubygems_pgsql_dump.tar file used in tests data
+# which contains a very small subset of gems for testing purposes
+
+cd /tmp
+
+# download rubygems load-pg-dump utility script
+curl -O https://raw.githubusercontent.com/rubygems/rubygems.org/1c8cf7e079e56f709e7fc8f4b2398637e41815f2/script/load-pg-dump
+
+# download latest rubygems pgsql dump and load rubygems db in local pgsql server
+./load-pg-dump -c rubygems_dump.tar
+
+# remove all rows in the rubygems db not related to gem haar_joke or l33tify
+# those gems have few releases so that is why they have been picked
+# also drop tables not needed by the rubygems lister
+cleanup_script=$(cat <<- EOF
+with t as (
+ select id from rubygems where name = 'haar_joke'
+),
+t2 as (
+ select id from rubygems where name = 'l33tify'
+) delete from versions where rubygem_id != (select id from t) and rubygem_id != (select id from t2);
+
+delete from rubygems where name != 'haar_joke' and name != 'l33tify';
+
+drop table dependencies;
+drop table gem_downloads;
+drop table linksets;
+EOF
+)
+echo $cleanup_script | psql rubygems
+
+# create the rubygems_pgsql_dump.tar file
+mkdir -p public_postgresql/databases
+pg_dump rubygems | gzip -c > public_postgresql/databases/PostgreSQL.sql.gz
+tar -cvf rubygems_pgsql_dump.tar public_postgresql
diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py
--- a/swh/lister/rubygems/tests/test_lister.py
+++ b/swh/lister/rubygems/tests/test_lister.py
@@ -2,26 +2,153 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+
+# flake8: noqa: B950
+
+from pathlib import Path
+
+import iso8601
+import pytest
+
from swh.lister.rubygems.lister import RubyGemsLister
+from swh.scheduler.model import ListedOrigin
+
+DUMP_FILEPATH = "production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar"
-expected_origins = [
- "https://rubygems.org/gems/mercurial-ruby",
- "https://rubygems.org/gems/mercurial-wrapper",
- "https://rubygems.org/gems/mercurius",
-]
+@pytest.fixture
+def expected_listed_origins():
+ return [
+ {
+ "url": "https://rubygems.org/gems/haar_joke",
+ "visit_type": "rubygems",
+ "last_update": iso8601.parse_date("2016-11-05T00:00:00+00:00"),
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "url": "https://rubygems.org/downloads/haar_joke-0.0.2.gem",
+ "length": 8704,
+ "version": "0.0.2",
+ "filename": "haar_joke-0.0.2.gem",
+ "checksums": {
+ "sha256": "85a8cf5f41890e9605265eeebfe9e99aa0350a01a3c799f9f55a0615a31a2f5f"
+ },
+ },
+ {
+ "url": "https://rubygems.org/downloads/haar_joke-0.0.1.gem",
+ "length": 8704,
+ "version": "0.0.1",
+ "filename": "haar_joke-0.0.1.gem",
+ "checksums": {
+ "sha256": "a2ee7052fb8ffcfc4ec0fdb77fae9a36e473f859af196a36870a0f386b5ab55e"
+ },
+ },
+ ],
+ "rubygem_metadata": [
+ {
+ "date": "2016-11-05T00:00:00+00:00",
+ "authors": "Gemma Gotch",
+ "version": "0.0.2",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.2.json",
+ },
+ {
+ "date": "2016-07-23T00:00:00+00:00",
+ "authors": "Gemma Gotch",
+ "version": "0.0.1",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.1.json",
+ },
+ ],
+ },
+ },
+ {
+ "url": "https://rubygems.org/gems/l33tify",
+ "visit_type": "rubygems",
+ "last_update": iso8601.parse_date("2014-11-14T00:00:00+00:00"),
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "url": "https://rubygems.org/downloads/l33tify-0.0.2.gem",
+ "length": 6144,
+ "version": "0.0.2",
+ "filename": "l33tify-0.0.2.gem",
+ "checksums": {
+ "sha256": "0087a21fb6161bba8892df40de3b5e27404f941658084413b8fde49db2bc7c9f"
+ },
+ },
+ {
+ "url": "https://rubygems.org/downloads/l33tify-0.0.3.gem",
+ "length": 6144,
+ "version": "0.0.3",
+ "filename": "l33tify-0.0.3.gem",
+ "checksums": {
+ "sha256": "4502097ddf2657d561ce0f527ef1f49f1658c8a0968ab8cc853273138f8382a2"
+ },
+ },
+ {
+ "url": "https://rubygems.org/downloads/l33tify-0.0.1.gem",
+ "length": 6144,
+ "version": "0.0.1",
+ "filename": "l33tify-0.0.1.gem",
+ "checksums": {
+ "sha256": "5abfb737ce5cf561726f2f7cc1ba0f0e4f865f8b7283192e05eb3f246d3dbbca"
+ },
+ },
+ ],
+ "rubygem_metadata": [
+ {
+ "date": "2014-11-14T00:00:00+00:00",
+ "authors": "E Alexander Liedtke",
+ "version": "0.0.2",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.2.json",
+ },
+ {
+ "date": "2014-11-14T00:00:00+00:00",
+ "authors": "E Alexander Liedtke",
+ "version": "0.0.3",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.3.json",
+ },
+ {
+ "date": "2014-11-14T00:00:00+00:00",
+ "authors": "E Alexander Liedtke",
+ "version": "0.0.1",
+ "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.1.json",
+ },
+ ],
+ },
+ },
+ ]
-def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
+
+@pytest.fixture(autouse=True)
+def network_requests_mock(datadir, requests_mock):
+ requests_mock.get(
+ RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_LIST_URL,
+ content=Path(datadir, "rubygems_dumps.xml").read_bytes(),
+ )
+ content = Path(datadir, "rubygems_pgsql_dump.tar").read_bytes()
+ requests_mock.get(
+ f"{RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_BASE_URL}/{DUMP_FILEPATH}",
+ content=content,
+ headers={"content-length": str(len(content))},
+ )
+
+
+@pytest.mark.db
+def test_rubygems_lister(swh_scheduler, expected_listed_origins):
lister = RubyGemsLister(scheduler=swh_scheduler)
res = lister.run()
- assert res.pages == 3
- assert res.origins == 1 + 1 + 1
+ assert res.pages == 2
+ assert res.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- assert len(scheduler_origins) == len(expected_origins)
-
- for origin in scheduler_origins:
- assert origin.visit_type == "rubygems"
- assert origin.url in expected_origins
+ assert [
+ {
+ "url": origin.url,
+ "visit_type": origin.visit_type,
+ "last_update": origin.last_update,
+ "extra_loader_arguments": origin.extra_loader_arguments,
+ }
+ for origin in scheduler_origins
+ ] == expected_listed_origins

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 6:34 AM (9 h, 45 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221150

Event Timeline