Page MenuHomeSoftware Heritage

D8529.diff
No OneTemporary

D8529.diff

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -80,6 +80,7 @@
lister.pubdev=swh.lister.pubdev:register
lister.puppet=swh.lister.puppet:register
lister.pypi=swh.lister.pypi:register
+ lister.rubygems=swh.lister.rubygems:register
lister.sourceforge=swh.lister.sourceforge:register
lister.tuleap=swh.lister.tuleap:register
lister.maven=swh.lister.maven:register
diff --git a/swh/lister/rubygems/__init__.py b/swh/lister/rubygems/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+RubyGems lister
+===============
+
+The RubyGems lister lists origins from `RubyGems.org`_, the Ruby community’s gem hosting service.
+
+As of September 2022, `RubyGems.org`_ lists 173384 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call an `http endpoint`_ which returns a list of gems
+as text.
+
+Page listing
+------------
+
+Each page returns an origin url based on the following pattern::
+
+ https://rubygems.org/gems/{pkgname}
+
+Origins from page
+-----------------
+
+The lister yields one origin url per page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/rubygems/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker compose up -d
+
+Then schedule a RubyGems listing task::
+
+ docker compose exec swh-scheduler swh scheduler task add -p oneshot list-rubygems
+
+You can follow lister execution by displaying logs of swh-lister service::
+
+ docker compose logs -f swh-lister
+
+.. _RubyGems.org: https://rubygems.org/
+.. _http endpoint: https://rubygems.org/versions
+"""
+
+
+def register():
+    """Describe this lister plugin: its lister class and its task modules."""
+    from .lister import RubyGemsLister
+
+    # The Celery tasks live in the sibling ``tasks`` module of this package.
+    task_modules = [f"{__name__}.tasks"]
+    return {"lister": RubyGemsLister, "task_modules": task_modules}
diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/lister.py
@@ -0,0 +1,75 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from typing import Iterator, List, Optional, Text
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# A page, as yielded by the lister's `get_pages` method, is a plain text value.
+RubyGemsListerPage = Text
+
+
+class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
+    """Lister for RubyGems.org, the Ruby community’s gem hosting service."""
+
+    LISTER_NAME = "rubygems"
+    VISIT_TYPE = "rubygems"
+    INSTANCE = "rubygems"
+
+    INDEX_URL = "https://rubygems.org/versions"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.INDEX_URL,
+        )
+
+    def get_pages(self) -> Iterator[RubyGemsListerPage]:
+        """Yield one 'page' (an origin url) per distinct package name.
+
+        It uses the index file located at `https://rubygems.org/versions`
+        to get a list of package names. Each page is an origin url based on
+        the following pattern::
+
+            https://rubygems.org/gems/{pkgname}
+
+        Urls are yielded in order of first appearance in the index, so two
+        runs over the same index list the same origins in the same order.
+        """
+        response = self.http_request(url=self.url)
+        data = response.content.decode()
+
+        # Skip the first 3 lines (file headers + first package named '-') and
+        # any blank line, which would otherwise yield a bogus empty name.
+        rows = (line for line in data.splitlines()[3:] if line.strip())
+
+        # The index may repeat a name: dedup, keeping first-seen order.
+        package_names: List[str] = list(dict.fromkeys(row.split()[0] for row in rows))
+
+        for pkgname in package_names:
+            yield f"https://rubygems.org/gems/{pkgname}"
+
+    def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
+        """Yield the single ListedOrigin whose url is the given page."""
+        assert self.lister_obj.id is not None
+
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            visit_type=self.VISIT_TYPE,
+            url=page,
+            last_update=None,
+        )
diff --git a/swh/lister/rubygems/tasks.py b/swh/lister/rubygems/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.rubygems.lister import RubyGemsLister
+
+
+@shared_task(name=f"{__name__}.RubyGemsListerTask")
+def list_rubygems(**lister_args):
+    """Run a RubyGems listing built from the config file; return its stats dict."""
+    return RubyGemsLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=f"{__name__}.ping")
+def _ping():
+    return "OK"
diff --git a/swh/lister/rubygems/tests/__init__.py b/swh/lister/rubygems/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
@@ -0,0 +1,6 @@
+created_at: 2022-09-01T00:00:05Z
+---
+- 1 05d0116933ba44b0b5d0ee19bfd35ccc
+mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
+mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
+mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/test_lister.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.rubygems.lister import RubyGemsLister
+
+# One origin url per package listed in the mocked ``versions`` data file.
+expected_origins = [
+    f"https://rubygems.org/gems/{pkgname}"
+    for pkgname in ("mercurial-ruby", "mercurial-wrapper", "mercurius")
+]
+
+
+def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
+    """List the mocked index and check what reaches the scheduler."""
+    rubygems_lister = RubyGemsLister(scheduler=swh_scheduler)
+    stats = rubygems_lister.run()
+
+    # One page, hence one origin, per package in the mocked index.
+    assert (stats.pages, stats.origins) == (3, 3)
+
+    lister_id = rubygems_lister.lister_obj.id
+    listed = swh_scheduler.get_listed_origins(lister_id).results
+    assert len(listed) == len(expected_origins)
+
+    assert all(origin.visit_type == "rubygems" for origin in listed)
+    assert all(origin.url in expected_origins for origin in listed)
diff --git a/swh/lister/rubygems/tests/test_tasks.py b/swh/lister/rubygems/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/rubygems/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_rubygems_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task must complete successfully and answer ``OK``."""
+    ping = swh_scheduler_celery_app.send_task("swh.lister.rubygems.tasks.ping")
+    assert ping
+    ping.wait()
+    assert ping.successful() and ping.result == "OK"
+
+
+def test_rubygems_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The listing task must return the stats of the lister it ran."""
+    # Swap the lister for a mock that builds itself and reports fixed stats.
+    expected_stats = ListerStats(pages=42, origins=42)
+    mocked_lister = mocker.patch("swh.lister.rubygems.tasks.RubyGemsLister")
+    mocked_lister.from_configfile.return_value = mocked_lister
+    mocked_lister.run.return_value = expected_stats
+
+    task_name = "swh.lister.rubygems.tasks.RubyGemsListerTask"
+    result = swh_scheduler_celery_app.send_task(task_name)
+    assert result
+    result.wait()
+    assert result.successful()
+    assert result.result == expected_stats.dict()
+
+    mocked_lister.from_configfile.assert_called_once_with()
+    mocked_lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Dec 20 2024, 9:25 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219674

Event Timeline