Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124171
D8529.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
8 KB
Subscribers
None
D8529.diff
View Options
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -80,6 +80,7 @@
lister.pubdev=swh.lister.pubdev:register
lister.puppet=swh.lister.puppet:register
lister.pypi=swh.lister.pypi:register
+ lister.rubygems=swh.lister.rubygems:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
lister.maven=swh.lister.maven:register
diff --git a/swh/lister/rubygems/__init__.py b/swh/lister/rubygems/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+RubyGems lister
+===============
+
+The RubyGems lister lists origins from `RubyGems.org`_, the Ruby community’s gem hosting service.
+
+As of September 2022, `RubyGems.org`_ lists 173384 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call an `http endpoint`_ which returns a list of gems
+as text.
+
+Page listing
+------------
+
+Each page is an origin url built from the following pattern::
+
+ https://rubygems.org/gems/{pkgname}
+
+Origins from page
+-----------------
+
+The lister yields one origin url per page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within the swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/rubygems/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker compose up -d
+
+Then schedule a RubyGems listing task::
+
+ docker compose exec swh-scheduler swh scheduler task add -p oneshot list-rubygems
+
+You can follow the lister execution by displaying the logs of the swh-lister service::
+
+ docker compose logs -f swh-lister
+
+.. _RubyGems.org: https://rubygems.org/
+.. _http endpoint: https://rubygems.org/versions
+"""
+
+
+def register():
+ from .lister import RubyGemsLister
+
+ return {
+ "lister": RubyGemsLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/lister.py
@@ -0,0 +1,75 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from typing import Iterator, List, Optional, Text
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+RubyGemsListerPage = Text
+
+
+class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
+ """Lister for RubyGems.org, the Ruby community’s gem hosting service."""
+
+ LISTER_NAME = "rubygems"
+ VISIT_TYPE = "rubygems"
+ INSTANCE = "rubygems"
+
+ INDEX_URL = "https://rubygems.org/versions"
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=self.INDEX_URL,
+ )
+
+ def get_pages(self) -> Iterator[RubyGemsListerPage]:
+ """Yield one 'page' per distinct package name listed in the index.
+
+ It uses the index file located at `https://rubygems.org/versions`
+ to get a list of package names. Each page is an origin url based on
+ the following pattern::
+
+ https://rubygems.org/gems/{pkgname}
+
+ """
+
+ package_names: List[str] = []
+ response = self.http_request(url=self.url)
+ data = response.content.decode()
+
+ # remove the first 3 lines (file headers + first package named '-')
+ for line in data.splitlines()[3:]:
+ package_names.append(line.split(" ")[0])
+
+ # Remove duplicates
+ package_names_set: List[str] = list(set(package_names))
+
+ for pkgname in package_names_set:
+ yield f"https://rubygems.org/gems/{pkgname}"
+
+ def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
+ """Yield the ListedOrigin built from the given page (an origin url)."""
+ assert self.lister_obj.id is not None
+
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=page,
+ last_update=None,
+ )
diff --git a/swh/lister/rubygems/tasks.py b/swh/lister/rubygems/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.rubygems.lister import RubyGemsLister
+
+
+@shared_task(name=__name__ + ".RubyGemsListerTask")
+def list_rubygems(**lister_args):
+ """Lister task for RubyGems"""
+ return RubyGemsLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/rubygems/tests/__init__.py b/swh/lister/rubygems/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
@@ -0,0 +1,6 @@
+created_at: 2022-09-01T00:00:05Z
+---
+- 1 05d0116933ba44b0b5d0ee19bfd35ccc
+mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
+mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
+mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/test_lister.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.rubygems.lister import RubyGemsLister
+
+expected_origins = [
+ "https://rubygems.org/gems/mercurial-ruby",
+ "https://rubygems.org/gems/mercurial-wrapper",
+ "https://rubygems.org/gems/mercurius",
+]
+
+
+def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
+ lister = RubyGemsLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 3
+ assert res.origins == 1 + 1 + 1
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ assert len(scheduler_origins) == len(expected_origins)
+
+ for origin in scheduler_origins:
+ assert origin.visit_type == "rubygems"
+ assert origin.url in expected_origins
diff --git a/swh/lister/rubygems/tests/test_tasks.py b/swh/lister/rubygems/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_rubygems_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.rubygems.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_rubygems_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked RubyGemsLister
+ lister = mocker.patch("swh.lister.rubygems.tasks.RubyGemsLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task(
+ "swh.lister.rubygems.tasks.RubyGemsListerTask"
+ )
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 20 2024, 9:25 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219674
Attached To
D8529: RubyGems: List origins from https://rubygems.org
Event Timeline
Log In to Comment