diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ lister.bower=swh.lister.bower:register lister.cgit=swh.lister.cgit:register lister.conda=swh.lister.conda:register + lister.cpan=swh.lister.cpan:register lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/cpan/__init__.py @@ -0,0 +1,73 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Cpan lister +============= + +The Cpan lister lists origins from `cpan.org`_, the Comprehensive Perl Archive +Network. It provides search features via `metacpan.org`_. + +As of September 2022 `cpan.org`_ lists 43675 package names. + +Origins retrieving strategy +--------------------------- + +To get a list of all package names we call a first `http api endpoint`_ that +retrieves results and a ``_scroll_id`` that will be used to scroll pages through +the `search`_ endpoint. + +Page listing +------------ + +Each page returns a list of ``results`` which are raw data from the API response. 
def register():
    """Return the plugin description of the Cpan lister.

    ``setup.py`` registers this function under the ``lister.cpan`` name.

    Returns:
        A dict with two keys: ``lister``, holding the ``CpanLister`` class,
        and ``task_modules``, a one-element list naming the ``tasks`` module
        of this package.
    """
    # Function-scope import: the lister module is only loaded when
    # register() is actually called.
    from .lister import CpanLister

    tasks_module = "%s.tasks" % __name__
    return dict(lister=CpanLister, task_modules=[tasks_module])
# A page is the list of raw hit dicts taken from one API response.
CpanListerPage = List[Dict[str, Any]]


class CpanLister(StatelessLister[CpanListerPage]):
    """List origins from CPAN, the Comprehensive Perl Archive Network.

    Distribution names are fetched from the metacpan.org API rooted at
    ``BASE_URL``: a first search request opens a scroll, then the scroll
    endpoint is queried until it returns a page without hits. Each name is
    turned into a ``https://metacpan.org/dist/{name}`` origin URL.
    """

    LISTER_NAME = "cpan"
    VISIT_TYPE = "cpan"
    INSTANCE = "cpan"

    BASE_URL = "https://fastapi.metacpan.org/v1/"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Initialise the lister with the fixed ``cpan`` instance and base URL.

        Args:
            scheduler: scheduler interface forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )

    def get_pages(self) -> Iterator[CpanListerPage]:
        """Yield pages of raw search hits, one page per HTTP response.

        The first page comes from the ``distribution/_search`` endpoint; the
        following ones come from ``_search/scroll``, queried with the
        ``_scroll_id`` of the previous response. Iteration stops after a
        response without hits; that last, empty page is still yielded.

        Raises:
            KeyError: if a response lacks ``hits.hits`` or ``_scroll_id``.
        """
        endpoint = f"{self.BASE_URL}distribution/_search"
        scrollendpoint = f"{self.BASE_URL}_search/scroll"
        size: int = 1000

        res = self.http_request(
            endpoint,
            params={
                "fields": ["name"],
                "size": size,
                "scroll": "1m",
            },
        )
        # Decode the JSON body once and reuse it for both lookups.
        body = res.json()
        data = body["hits"]["hits"]
        yield data

        _scroll_id = body["_scroll_id"]

        while data:
            scroll_res = self.http_request(
                scrollendpoint, params={"scroll": "1m", "scroll_id": _scroll_id}
            )
            body = scroll_res.json()
            data = body["hits"]["hits"]
            _scroll_id = body["_scroll_id"]
            yield data

    def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per usable entry of ``page``.

        Entries without a ``fields`` or ``fields.name`` key, and entries whose
        name is empty, are skipped.

        Args:
            page: raw hits as yielded by ``get_pages``.
        """
        # NOTE(review): presumably narrows an Optional id for type checkers;
        # lister_obj is provided by the base class, not visible here.
        assert self.lister_obj.id is not None

        for entry in page:
            # Skip the entry if 'fields' or 'name' keys are missing
            if "fields" not in entry or "name" not in entry["fields"]:
                continue

            pkgname = entry["fields"]["name"]
            # The name is either a plain string or a one-value list (both
            # shapes appear in the test fixtures).
            # TODO: Check why sometimes it is a one value list
            if not isinstance(pkgname, str):
                pkgname = pkgname[0] if pkgname else None

            # An empty name (or empty list) cannot form a valid origin URL.
            if not pkgname:
                continue

            url = f"https://metacpan.org/dist/{pkgname}"

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=url,
                last_update=None,
            )
@shared_task(name=__name__ + ".CpanListerTask")
def list_cpan(**lister_args):
    """Run the Cpan lister and return its statistics as a dict.

    Args:
        **lister_args: keyword arguments forwarded unchanged to
            ``CpanLister.from_configfile``.

    Returns:
        The result of calling ``.dict()`` on what ``CpanLister.run()``
        returns.
    """
    lister = CpanLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()


@shared_task(name=__name__ + ".ping")
def _ping():
    """Return the constant string ``"OK"``."""
    reply = "OK"
    return reply
"_score" : 1.0, + "_index" : "cpan_v1_01", + "_id" : "Internals-CountObjects", + "fields" : { + "name" : [ + "Internals-CountObjects" + ] + }, + "_type" : "distribution" + } + ], + "total" : 43675 + }, + "took" : 72, + "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", + "terminated_early" : true +} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 new file mode 100644 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 @@ -0,0 +1,16 @@ +{ + "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", + "took" : 1, + "hits" : { + "hits" : [], + "total" : 43675, + "max_score" : 1.0 + }, + "terminated_early" : true, + "timed_out" : false, + "_shards" : { + "failed" : 0, + "total" : 3, + "successful" : 3 + } +} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m new file mode 100644 --- /dev/null +++ 
b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m @@ -0,0 +1,52 @@ +{ + "_shards" : { + "successful" : 3, + "failed" : 0, + "total" : 3 + }, + "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", + "took" : 61, + "hits" : { + "max_score" : 1.0, + "hits" : [ + { + "_score" : 1.0, + "_index" : "cpan_v1_01", + "_id" : "openerserver_perl-master", + "fields" : { + "name" : "openerserver_perl-master" + }, + "_type" : "distribution" + }, + { + "_score" : 1.0, + "_type" : "distribution", + "fields" : { + "name" : "Getopt_Auto" + }, + "_id" : "Getopt_Auto", + "_index" : "cpan_v1_01" + }, + { + "_id" : "App-Booklist", + "_index" : "cpan_v1_01", + "_type" : "distribution", + "fields" : { + "name" : "App-Booklist" + }, + "_score" : 1.0 + }, + { + "fields" : { + "name" : "EuclideanRhythm" + }, + "_type" : "distribution", + "_index" : "cpan_v1_01", + "_id" : "EuclideanRhythm", + "_score" : 1.0 + } + ], + "total" : 43675 + }, + "timed_out" : false +} diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/cpan/tests/test_lister.py @@ -0,0 +1,31 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.lister.cpan.lister import CpanLister + +expected_origins = [ + "https://metacpan.org/dist/App-Booklist", + "https://metacpan.org/dist/EuclideanRhythm", + "https://metacpan.org/dist/EventSource-Server", + "https://metacpan.org/dist/Getopt_Auto", + "https://metacpan.org/dist/Interchange6", + "https://metacpan.org/dist/Internals-CountObjects", + "https://metacpan.org/dist/openerserver_perl-master", +] + + +def 
# --- module of the patch: swh/lister/cpan/tests/test_lister.py ---


def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
    """Run the lister against the recorded API responses.

    The fixtures hold four hits for the initial search, three for the first
    scroll request and none for the ``_visit1`` scroll response, hence three
    pages and seven origins.
    """
    cpan_lister = CpanLister(scheduler=swh_scheduler)
    stats = cpan_lister.run()

    assert stats.pages == 3
    assert stats.origins == 4 + 3 + 0

    listed = swh_scheduler.get_listed_origins(cpan_lister.lister_obj.id).results

    assert len(listed) == len(expected_origins)

    for listed_origin in listed:
        assert listed_origin.visit_type == "cpan"
        assert listed_origin.url in expected_origins


# --- module of the patch: swh/lister/cpan/tests/test_tasks.py ---


def test_cpan_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ``ping`` task of the cpan tasks module answers ``"OK"``."""
    task = swh_scheduler_celery_app.send_task("swh.lister.cpan.tasks.ping")
    assert task
    task.wait()
    assert task.successful()
    assert task.result == "OK"


def test_cpan_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The ``CpanListerTask`` task returns the dict form of the lister stats.

    ``CpanLister`` is replaced by a mock whose ``from_configfile`` returns the
    mock itself and whose ``run`` returns fixed statistics.
    """
    # setup the mocked CpanLister
    mocked_lister = mocker.patch("swh.lister.cpan.tasks.CpanLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    expected_stats = ListerStats(pages=42, origins=42)
    mocked_lister.run.return_value = expected_stats

    task = swh_scheduler_celery_app.send_task("swh.lister.cpan.tasks.CpanListerTask")
    assert task
    task.wait()
    assert task.successful()
    assert task.result == expected_stats.dict()

    # The task must build the lister without arguments and run it once.
    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()