diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -42,3 +42,9 @@ [mypy-dulwich.*] ignore_missing_imports = True + +[mypy-testing.postgresql.*] +ignore_missing_imports = True + +[mypy-psycopg2.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ tenacity >= 6.2 lxml dulwich +testing.postgresql +psycopg2 diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -3,8 +3,20 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import base64 +from datetime import timezone +import gzip import logging -from typing import Iterator, List, Optional, Text +import os +import shutil +import subprocess +import tarfile +import tempfile +from typing import Any, Dict, Iterator, Optional, Tuple + +from bs4 import BeautifulSoup +import psycopg2 +from testing.postgresql import Postgresql from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -13,18 +25,39 @@ logger = logging.getLogger(__name__) -# Aliasing the page results returned by `get_pages` method from the lister. -RubyGemsListerPage = Text +RubyGemsListerPage = Dict[str, Any] class RubyGemsLister(StatelessLister[RubyGemsListerPage]): - """Lister for RubyGems.org, the Ruby community’s gem hosting service.""" + """Lister for RubyGems.org, the Ruby community's gem hosting service. + + Instead of querying rubygems.org Web API, it uses gems data from the + daily PostgreSQL database dump of rubygems. It allows gathering all + interesting info about a gem and its release artifacts (version number, + download URL, checksums, release date) in an efficient way and without + flooding rubygems Web API with numerous HTTP requests (as there are more + than 187000 gems available on 07/10/2022). 
+ """ LISTER_NAME = "rubygems" VISIT_TYPE = "rubygems" INSTANCE = "rubygems" - INDEX_URL = "https://rubygems.org/versions" + RUBY_GEMS_POSTGRES_DUMP_BASE_URL = ( + "https://s3-us-west-2.amazonaws.com/rubygems-dumps" + ) + RUBY_GEMS_POSTGRES_DUMP_LIST_URL = ( + f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql" + ) + + RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem" + RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}" + RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = ( + "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json" + ) + + DB_NAME = "rubygems" + DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz" def __init__( self, @@ -35,41 +68,147 @@ scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, - url=self.INDEX_URL, + url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, ) - def get_pages(self) -> Iterator[RubyGemsListerPage]: - """Yield an iterator which returns 'page' - - It uses the index file located at `https://rubygems.org/versions` - to get a list of package names. 
Each page returns an origin url based on - the following pattern:: - - https://rubygems.org/gems/{pkgname} + def get_latest_dump_file(self) -> str: + response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL) + xml = BeautifulSoup(response.content, "xml") + contents = xml.find_all("Contents") + return contents[-1].find("Key").text + + def create_rubygems_db( + self, postgresql: Postgresql + ) -> Tuple[str, psycopg2._psycopg.connection]: + logger.debug("Creating rubygems database") + + db_dsn = postgresql.dsn() + db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME) + db = psycopg2.connect(**db_dsn) + db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + with db.cursor() as cursor: + cursor.execute(f"CREATE DATABASE {self.DB_NAME}") + + db_dsn["database"] = self.DB_NAME + + db = psycopg2.connect(**db_dsn) + db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + with db.cursor() as cursor: + cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore") + + return db_url, db + + def populate_rubygems_db(self, db_url: str): + dump_file = self.get_latest_dump_file() + dump_id = dump_file.split("/")[2] + + response = self.http_request(f"{self.url}/{dump_file}", stream=True) + + with tempfile.TemporaryDirectory() as temp_dir: + logger.debug( + "Downloading latest rubygems database dump: %s (%s bytes)", + dump_id, + response.headers["content-length"], + ) + dump_file = os.path.join(temp_dir, "rubygems_dump.tar") + with open(dump_file, "wb") as dump: + for chunk in response.iter_content(chunk_size=1024): + dump.write(chunk) + + with tarfile.open(dump_file) as dump_tar: + dump_tar.extractall(temp_dir) + + logger.debug("Populating rubygems database with dump %s", dump_id) + psql = subprocess.Popen( + ["psql", "-q", db_url], + stdin=subprocess.PIPE, + ) + + # passing value of gzip.open as stdin of subprocess.run makes the process + # read raw data instead of decompressed data so we have to use a pipe + with 
gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql: + shutil.copyfileobj(sql, psql.stdin) # type: ignore + + # denote end of read file + psql.stdin.close() # type: ignore + psql.wait() - """ - - package_names: List[str] = [] - response = self.http_request(url=self.url) - data = response.content.decode() - - # remove the first 3 lines (file headers + first package named '-') - for line in data.splitlines()[3:]: - package_names.append(line.split(" ")[0]) - - # Remove duplicates - package_names_set: List[str] = list(set(package_names)) - - for pkgname in package_names_set: - yield f"https://rubygems.org/gems/{pkgname}" + def get_pages(self) -> Iterator[RubyGemsListerPage]: + # spawn a temporary postgres instance (require initdb executable in environment) + with Postgresql() as postgresql: + db_url, db = self.create_rubygems_db(postgresql) + self.populate_rubygems_db(db_url) + + with db.cursor() as cursor: + cursor.execute("SELECT id, name from rubygems") + for gem_id, gem_name in cursor.fetchall(): + logger.debug("Processing gem named %s", gem_name) + with db.cursor() as cursor_v: + cursor_v.execute( + "SELECT authors, built_at, number, sha256, size from versions " + "where rubygem_id = %s", + (gem_id,), + ) + versions = [ + { + "number": number, + "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format( + gem=gem_name, version=number + ), + "date": built_at.replace(tzinfo=timezone.utc), + "authors": authors, + "sha256": ( + base64.decodebytes(sha256.encode()).hex() + if sha256 + else None + ), + "size": size, + } + for authors, built_at, number, sha256, size in cursor_v.fetchall() + ] + if versions: + yield { + "name": gem_name, + "versions": versions, + } def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]: - """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None + artifacts = [] + rubygem_metadata = [] + for version in page["versions"]: + artifacts.append( + { + "version": 
version["number"], + "filename": version["url"].split("/")[-1], + "url": version["url"], + "checksums": ( + {"sha256": version["sha256"]} if version["sha256"] else {} + ), + "length": version["size"], + } + ) + rubygem_metadata.append( + { + "version": version["number"], + "date": version["date"].isoformat(), + "authors": version["authors"], + "extrinsic_metadata_url": ( + self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format( + gem=page["name"], version=version["number"] + ) + ), + } + ) + yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=page, - last_update=None, + url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]), + last_update=max(version["date"] for version in page["versions"]), + extra_loader_arguments={ + "artifacts": artifacts, + "rubygem_metadata": rubygem_metadata, + }, ) diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions deleted file mode 100644 --- a/swh/lister/rubygems/tests/data/https_rubygems.org/versions +++ /dev/null @@ -1,6 +0,0 @@ -created_at: 2022-09-01T00:00:05Z ---- -- 1 05d0116933ba44b0b5d0ee19bfd35ccc -mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260 -mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22 -mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd diff --git a/swh/lister/rubygems/tests/data/rubygems_dumps.xml b/swh/lister/rubygems/tests/data/rubygems_dumps.xml new file mode 100644 --- /dev/null +++ b/swh/lister/rubygems/tests/data/rubygems_dumps.xml @@ -0,0 +1,22 @@ + + + rubygems-dumps + production/public_postgresql + + 1000 + false + + production/public_postgresql/2022.10.05.06.10.11/public_postgresql.tar + 2022-10-05T06:11:15.000Z + "d1c447a2a490225c2d59061e60ed86e9-75" + 
391653888 + STANDARD + + + production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar + 2022-10-06T06:11:11.000Z + "2ccd9340e4f802ec982e4cd00db2d168-75" + 390047744 + STANDARD + + \ No newline at end of file diff --git a/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar b/swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ public_postgresql/databases/PostgreSQL.sql.gz +tar -cvf rubygems_pgsql_dump.tar public_postgresql diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py --- a/swh/lister/rubygems/tests/test_lister.py +++ b/swh/lister/rubygems/tests/test_lister.py @@ -2,26 +2,153 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + +# flake8: noqa: B950 + +from pathlib import Path + +import iso8601 +import pytest + from swh.lister.rubygems.lister import RubyGemsLister +from swh.scheduler.model import ListedOrigin + +DUMP_FILEPATH = "production/public_postgresql/2022.10.06.06.10.05/public_postgresql.tar" -expected_origins = [ - "https://rubygems.org/gems/mercurial-ruby", - "https://rubygems.org/gems/mercurial-wrapper", - "https://rubygems.org/gems/mercurius", -] +@pytest.fixture +def expected_listed_origins(): + return [ + { + "url": "https://rubygems.org/gems/haar_joke", + "visit_type": "rubygems", + "last_update": iso8601.parse_date("2016-11-05T00:00:00+00:00"), + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://rubygems.org/downloads/haar_joke-0.0.2.gem", + "length": 8704, + "version": "0.0.2", + "filename": "haar_joke-0.0.2.gem", + "checksums": { + "sha256": "85a8cf5f41890e9605265eeebfe9e99aa0350a01a3c799f9f55a0615a31a2f5f" + }, + }, + { + "url": "https://rubygems.org/downloads/haar_joke-0.0.1.gem", + 
"length": 8704, + "version": "0.0.1", + "filename": "haar_joke-0.0.1.gem", + "checksums": { + "sha256": "a2ee7052fb8ffcfc4ec0fdb77fae9a36e473f859af196a36870a0f386b5ab55e" + }, + }, + ], + "rubygem_metadata": [ + { + "date": "2016-11-05T00:00:00+00:00", + "authors": "Gemma Gotch", + "version": "0.0.2", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.2.json", + }, + { + "date": "2016-07-23T00:00:00+00:00", + "authors": "Gemma Gotch", + "version": "0.0.1", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/haar_joke/versions/0.0.1.json", + }, + ], + }, + }, + { + "url": "https://rubygems.org/gems/l33tify", + "visit_type": "rubygems", + "last_update": iso8601.parse_date("2014-11-14T00:00:00+00:00"), + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://rubygems.org/downloads/l33tify-0.0.2.gem", + "length": 6144, + "version": "0.0.2", + "filename": "l33tify-0.0.2.gem", + "checksums": { + "sha256": "0087a21fb6161bba8892df40de3b5e27404f941658084413b8fde49db2bc7c9f" + }, + }, + { + "url": "https://rubygems.org/downloads/l33tify-0.0.3.gem", + "length": 6144, + "version": "0.0.3", + "filename": "l33tify-0.0.3.gem", + "checksums": { + "sha256": "4502097ddf2657d561ce0f527ef1f49f1658c8a0968ab8cc853273138f8382a2" + }, + }, + { + "url": "https://rubygems.org/downloads/l33tify-0.0.1.gem", + "length": 6144, + "version": "0.0.1", + "filename": "l33tify-0.0.1.gem", + "checksums": { + "sha256": "5abfb737ce5cf561726f2f7cc1ba0f0e4f865f8b7283192e05eb3f246d3dbbca" + }, + }, + ], + "rubygem_metadata": [ + { + "date": "2014-11-14T00:00:00+00:00", + "authors": "E Alexander Liedtke", + "version": "0.0.2", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.2.json", + }, + { + "date": "2014-11-14T00:00:00+00:00", + "authors": "E Alexander Liedtke", + "version": "0.0.3", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.3.json", + }, + { + "date": 
"2014-11-14T00:00:00+00:00", + "authors": "E Alexander Liedtke", + "version": "0.0.1", + "extrinsic_metadata_url": "https://rubygems.org/api/v2/rubygems/l33tify/versions/0.0.1.json", + }, + ], + }, + }, + ] -def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler): + +@pytest.fixture(autouse=True) +def network_requests_mock(datadir, requests_mock): + requests_mock.get( + RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_LIST_URL, + content=Path(datadir, "rubygems_dumps.xml").read_bytes(), + ) + content = Path(datadir, "rubygems_pgsql_dump.tar").read_bytes() + requests_mock.get( + f"{RubyGemsLister.RUBY_GEMS_POSTGRES_DUMP_BASE_URL}/{DUMP_FILEPATH}", + content=content, + headers={"content-length": str(len(content))}, + ) + + +@pytest.mark.db +def test_rubygems_lister(swh_scheduler, expected_listed_origins): lister = RubyGemsLister(scheduler=swh_scheduler) res = lister.run() - assert res.pages == 3 - assert res.origins == 1 + 1 + 1 + assert res.pages == 2 + assert res.origins == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == len(expected_origins) - - for origin in scheduler_origins: - assert origin.visit_type == "rubygems" - assert origin.url in expected_origins + assert [ + { + "url": origin.url, + "visit_type": origin.visit_type, + "last_update": origin.last_update, + "extra_loader_arguments": origin.extra_loader_arguments, + } + for origin in scheduler_origins + ] == expected_listed_origins