diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -67,6 +67,7 @@
         lister.github=swh.lister.github:register
         lister.gitlab=swh.lister.gitlab:register
         lister.gnu=swh.lister.gnu:register
+        lister.hackage=swh.lister.hackage:register
         lister.launchpad=swh.lister.launchpad:register
         lister.npm=swh.lister.npm:register
         lister.opam=swh.lister.opam:register
diff --git a/swh/lister/hackage/__init__.py b/swh/lister/hackage/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/__init__.py
@@ -0,0 +1,69 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Hackage lister
+==============
+
+The Hackage lister lists origins from `hackage.haskell.org`_, the `Haskell`_ Package
+Repository.
+
+The registry provides an `http api`_ from which the lister retrieves package names
+and builds origin urls.
+
+As of August 2022 `hackage.haskell.org`_ lists 15536 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call the `https://hackage.haskell.org/packages` endpoint.
+
+Page listing
+------------
+
+There is only one page, which lists all origin urls.
+
+Origins from page
+-----------------
+
+The lister yields all origin urls from that single page.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+   pytest -s -vv --log-cli-level=DEBUG swh/lister/hackage/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+   docker-compose up -d
+
+Then connect to the lister::
+
+   docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+   swh lister run -l hackage
+
+.. _hackage.haskell.org: https://hackage.haskell.org/
+.. _Haskell: https://haskell.org/
+.. _http api: https://hackage.haskell.org/api
+"""
+
+
+def register():
+    """Return the lister class and the task modules registered for this lister."""
+    from .lister import HackageLister
+
+    return {
+        "lister": HackageLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/lister.py
@@ -0,0 +1,98 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Alias for the type of the pages yielded by the lister's `get_pages` method.
+HackageListerPage = List[Dict[str, str]]
+
+
+class HackageLister(StatelessLister[HackageListerPage]):
+    """List Hackage (The Haskell Package Repository) origins."""
+
+    LISTER_NAME = "hackage"
+    VISIT_TYPE = "hackage"
+    INSTANCE = "hackage"
+
+    BASE_URL = "https://hackage.haskell.org/"
+    PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/"
+    PACKAGE_INFO_URL_PATTERN = "{base_url}package/{pkgname}"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.BASE_URL,
+        )
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "Accept": "application/json",
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """Fetch ``url`` with ``params``, warn on non-200 and raise on HTTP errors."""
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def get_pages(self) -> Iterator[HackageListerPage]:
+        """Yield the single page listing every Hackage package name.
+
+        It uses the api endpoint provided by `https://hackage.haskell.org`
+        to get a list of package names from which we build an origin url.
+
+        There is only one page, which lists all package names.
+        """
+        response = self.page_request(
+            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+        )
+        yield response.json()
+
+    def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
+        """Iterate over the entries of a page and yield ListedOrigin instances."""
+        assert self.lister_obj.id is not None
+
+        for entry in page:
+            pkgname = entry["packageName"]
+            url = self.PACKAGE_INFO_URL_PATTERN.format(
+                base_url=self.url, pkgname=pkgname
+            )
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=url,
+                last_update=None,
+            )
diff --git a/swh/lister/hackage/tasks.py b/swh/lister/hackage/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.hackage.lister import HackageLister
+
+
+@shared_task(name=__name__ + ".HackageListerTask")
+def list_hackage(**lister_args):
+    """Lister task for Hackage, the Haskell Package Repository"""
+    return HackageLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    return "OK"
diff --git a/swh/lister/hackage/tests/__init__.py b/swh/lister/hackage/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages
@@ -0,0 +1 @@
+[{"packageName":"aeson"}, {"packageName":"colors"}, {"packageName":"servant"}]
diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/tests/test_lister.py
@@ -0,0 +1,38 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.hackage.lister import HackageLister
+
+expected_origins = [
+    {"name": "aeson", "url": "https://hackage.haskell.org/package/aeson"},
+    {"name": "colors", "url": "https://hackage.haskell.org/package/colors"},
+    {"name": "servant", "url": "https://hackage.haskell.org/package/servant"},
+]
+
+
+def test_hackage_lister(datadir, requests_mock_datadir, swh_scheduler):
+    lister = HackageLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == 1 + 1 + 1
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins)
+
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+        )
+        for scheduled in scheduler_origins
+    } == {
+        (
+            "hackage",
+            expected["url"],
+        )
+        for expected in expected_origins
+    }
diff --git a/swh/lister/hackage/tests/test_tasks.py b/swh/lister/hackage/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/tests/test_tasks.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_hackage_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    res = swh_scheduler_celery_app.send_task("swh.lister.hackage.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_hackage_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    # setup the mocked HackageLister
+    lister = mocker.patch("swh.lister.hackage.tasks.HackageLister")
+    lister.from_configfile.return_value = lister
+    stats = ListerStats(pages=42, origins=42)
+    lister.run.return_value = stats
+
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.hackage.tasks.HackageListerTask"
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == stats.dict()
+
+    lister.from_configfile.assert_called_once_with()
+    lister.run.assert_called_once_with()