diff --git a/setup.py b/setup.py
index ecc42ec..54ea4b0 100755
--- a/setup.py
+++ b/setup.py
@@ -1,101 +1,102 @@
 #!/usr/bin/env python3
 # Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from io import open
 from os import path
 
 from setuptools import find_packages, setup
 
 here = path.abspath(path.dirname(__file__))
 
 # Get the long description from the README file
 with open(path.join(here, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
 
 def parse_requirements(name=None):
     if name:
         reqf = "requirements-%s.txt" % name
     else:
         reqf = "requirements.txt"
 
     requirements = []
     if not path.exists(reqf):
         return requirements
 
     with open(reqf) as f:
         for line in f.readlines():
             line = line.strip()
             if not line or line.startswith("#"):
                 continue
             requirements.append(line)
     return requirements
 
 
 setup(
     name="swh.lister",
     description="Software Heritage lister",
     long_description=long_description,
     long_description_content_type="text/markdown",
     python_requires=">=3.7",
     author="Software Heritage developers",
     author_email="swh-devel@inria.fr",
     url="https://forge.softwareheritage.org/diffusion/DLSGH/",
     packages=find_packages(),
     install_requires=parse_requirements() + parse_requirements("swh"),
     tests_require=parse_requirements("test"),
     setup_requires=["setuptools-scm"],
     extras_require={"testing": parse_requirements("test")},
     use_scm_version=True,
     include_package_data=True,
     entry_points="""
         [swh.cli.subcommands]
         lister=swh.lister.cli
         [swh.workers]
         lister.arch=swh.lister.arch:register
         lister.aur=swh.lister.aur:register
         lister.bitbucket=swh.lister.bitbucket:register
         lister.bower=swh.lister.bower:register
         lister.cgit=swh.lister.cgit:register
         lister.conda=swh.lister.conda:register
         lister.cpan=swh.lister.cpan:register
         lister.cran=swh.lister.cran:register
         lister.crates=swh.lister.crates:register
         lister.debian=swh.lister.debian:register
         lister.gitea=swh.lister.gitea:register
         lister.github=swh.lister.github:register
         lister.gitlab=swh.lister.gitlab:register
         lister.gnu=swh.lister.gnu:register
         lister.golang=swh.lister.golang:register
         lister.hackage=swh.lister.hackage:register
         lister.launchpad=swh.lister.launchpad:register
         lister.npm=swh.lister.npm:register
         lister.nuget=swh.lister.nuget:register
         lister.opam=swh.lister.opam:register
         lister.packagist=swh.lister.packagist:register
         lister.phabricator=swh.lister.phabricator:register
         lister.pubdev=swh.lister.pubdev:register
         lister.puppet=swh.lister.puppet:register
         lister.pypi=swh.lister.pypi:register
+        lister.rubygems=swh.lister.rubygems:register
         lister.sourceforge=swh.lister.sourceforge:register
         lister.tuleap=swh.lister.tuleap:register
         lister.maven=swh.lister.maven:register
         lister.gogs=swh.lister.gogs:register
     """,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Intended Audience :: Developers",
         "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
         "Operating System :: OS Independent",
         "Development Status :: 5 - Production/Stable",
     ],
     project_urls={
         "Bug Reports": "https://forge.softwareheritage.org/maniphest",
         "Funding": "https://www.softwareheritage.org/donate",
         "Source": "https://forge.softwareheritage.org/source/swh-lister",
         "Documentation": "https://docs.softwareheritage.org/devel/swh-lister/",
     },
 )
diff --git a/swh/lister/rubygems/__init__.py b/swh/lister/rubygems/__init__.py
new file mode 100644
index 0000000..3435e18
--- /dev/null
+++ b/swh/lister/rubygems/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+RubyGems lister
+===============
+
+The RubyGems lister lists origins from `RubyGems.org`_, the Ruby community’s gem hosting service.
+
+As of September 2022 `RubyGems.org`_ lists 173384 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call an `http endpoint`_ which returns a list of gems
+as text.
+
+Page listing
+------------
+
+Each page returns an origin url based on the following pattern::
+
+    https://rubygems.org/gems/{pkgname}
+
+Origins from page
+-----------------
+
+The lister yields one origin url per page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+   pytest -s -vv --log-cli-level=DEBUG swh/lister/rubygems/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+    docker compose up -d
+
+Then schedule a RubyGems listing task::
+
+    docker compose exec swh-scheduler swh scheduler task add -p oneshot list-rubygems
+
+You can follow lister execution by displaying logs of swh-lister service::
+
+    docker compose logs -f swh-lister
+
+.. _RubyGems.org: https://rubygems.org/
+.. _http endpoint: https://rubygems.org/versions
+"""
+
+
+def register():
+    from .lister import RubyGemsLister
+
+    return {
+        "lister": RubyGemsLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py
new file mode 100644
index 0000000..c4cb707
--- /dev/null
+++ b/swh/lister/rubygems/lister.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from typing import Iterator, Optional, Set, Text
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+RubyGemsListerPage = Text
+
+
+class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
+    """Lister for RubyGems.org, the Ruby community’s gem hosting service."""
+
+    LISTER_NAME = "rubygems"
+    VISIT_TYPE = "rubygems"
+    INSTANCE = "rubygems"
+
+    INDEX_URL = "https://rubygems.org/versions"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.INDEX_URL,
+        )
+
+    def get_pages(self) -> Iterator[RubyGemsListerPage]:
+        """Yield an iterator which returns 'page'
+
+        It uses the index file located at `https://rubygems.org/versions`
+        to get a list of package names. Each page returns an origin url based on
+        the following pattern::
+
+            https://rubygems.org/gems/{pkgname}
+
+        Each package name is yielded once, in the order of its first
+        appearance in the index file.
+        """
+
+        response = self.http_request(url=self.url)
+        data = response.content.decode()
+
+        # The same name can show up on several lines of the index: remember
+        # the names already yielded instead of iterating over a set, whose
+        # order is not stable from one run to the next.
+        seen: Set[str] = set()
+
+        # skip the first 3 lines (file headers + first package named '-')
+        for line in data.splitlines()[3:]:
+            pkgname = line.split(" ")[0]
+            if not pkgname or pkgname in seen:
+                # blank line or gem already listed: nothing new to yield
+                continue
+            seen.add(pkgname)
+            yield f"https://rubygems.org/gems/{pkgname}"
+
+    def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]:
+        """Yield the ListedOrigin built from the origin url held by ``page``."""
+        assert self.lister_obj.id is not None
+
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            visit_type=self.VISIT_TYPE,
+            url=page,
+            last_update=None,
+        )
diff --git a/swh/lister/rubygems/tasks.py b/swh/lister/rubygems/tasks.py
new file mode 100644
index 0000000..a1395d3
--- /dev/null
+++ b/swh/lister/rubygems/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.rubygems.lister import RubyGemsLister
+
+
+@shared_task(name=__name__ + ".RubyGemsListerTask")
+def list_rubygems(**lister_args):
+    """Lister task for RubyGems"""
+    return RubyGemsLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    return "OK"
diff --git a/swh/lister/rubygems/tests/__init__.py b/swh/lister/rubygems/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/lister/rubygems/tests/data/https_rubygems.org/versions b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
new file mode 100644
index 0000000..74d2703
--- /dev/null
+++ b/swh/lister/rubygems/tests/data/https_rubygems.org/versions
@@ -0,0 +1,6 @@
+created_at: 2022-09-01T00:00:05Z
+---
+- 1 05d0116933ba44b0b5d0ee19bfd35ccc
+mercurial-ruby 0.3.0,0.4.0,0.5.0,0.6.0,0.6.1,0.7.0,0.7.1,0.7.2,0.7.3,0.7.4,0.7.5,0.7.6,0.7.7,0.7.8,0.7.9,0.7.10,0.7.11,0.7.12 3ea9d3b3f1010f06d292dcfcc799f260
+mercurial-wrapper 0.8.4,0.8.5 b6541e48f15eafc0b50fa694cdbffc22
+mercurius 0.0.1,0.0.2,0.0.3,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.1.0,0.1.1,0.1.2,0.1.3,0.1.4,0.1.5,0.1.6,0.1.7,0.1.8,0.1.9,0.2.0,0.2.1 9a388c7c57d2ed4a879ab42520d91ffd
diff --git a/swh/lister/rubygems/tests/test_lister.py b/swh/lister/rubygems/tests/test_lister.py
new file mode 100644
index 0000000..8a5f355
--- /dev/null
+++ b/swh/lister/rubygems/tests/test_lister.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.rubygems.lister import RubyGemsLister
+
+expected_origins = [
+    "https://rubygems.org/gems/mercurial-ruby",
+    "https://rubygems.org/gems/mercurial-wrapper",
+    "https://rubygems.org/gems/mercurius",
+]
+
+
+def test_rubygems_lister(datadir, requests_mock_datadir, swh_scheduler):
+    lister = RubyGemsLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 3
+    assert res.origins == 1 + 1 + 1
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins)
+
+    for origin in scheduler_origins:
+        assert origin.visit_type == "rubygems"
+        assert origin.url in expected_origins
diff --git a/swh/lister/rubygems/tests/test_tasks.py b/swh/lister/rubygems/tests/test_tasks.py
new file mode 100644
index 0000000..0267dcd
--- /dev/null
+++ b/swh/lister/rubygems/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_rubygems_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    res = swh_scheduler_celery_app.send_task("swh.lister.rubygems.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_rubygems_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    # setup the mocked RubyGemsLister
+    lister = mocker.patch("swh.lister.rubygems.tasks.RubyGemsLister")
+    lister.from_configfile.return_value = lister
+    stats = ListerStats(pages=42, origins=42)
+    lister.run.return_value = stats
+
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.rubygems.tasks.RubyGemsListerTask"
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == stats.dict()
+
+    lister.from_configfile.assert_called_once_with()
+    lister.run.assert_called_once_with()