Page MenuHomeSoftware Heritage

D8542.id30804.diff
No OneTemporary

D8542.id30804.diff

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,7 @@
lister.bitbucket=swh.lister.bitbucket:register
lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
+ lister.cpan=swh.lister.cpan:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/__init__.py
@@ -0,0 +1,73 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Cpan lister
+=============
+
+The Cpan lister lists origins from `cpan.org`_, the Comprehensive Perl Archive
+Network, whose search features are provided by `metacpan.org`_.
+
+As of September 2022 `metacpan.org`_ lists 43675 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we first call an `http api endpoint`_ that
+returns results and a ``_scroll_id``, which is then used to scroll through the
+remaining pages via the `search`_ endpoint.
+
+Page listing
+------------
+
+Each page is a list of ``results``, the raw data from the api response.
+
+Origins from page
+-----------------
+
+The origin url is the html page corresponding to a package name on `metacpan.org`_,
+following this pattern::
+
+ "https://metacpan.org/dist/{pkgname}"
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/cpan/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker compose up -d
+
+Then schedule a Cpan listing task::
+
+ docker compose exec swh-scheduler swh scheduler task add -p oneshot list-cpan
+
+You can follow lister execution by displaying logs of swh-lister service::
+
+ docker compose logs -f swh-lister
+
+.. _cpan.org: https://cpan.org/
+.. _metacpan.org: https://metacpan.org/
+.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
+.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
+
+
+"""
+
+
+def register():
+    """Return the registration data of the Cpan lister.
+
+    The returned mapping holds the lister class under ``"lister"`` and the
+    list of task module names under ``"task_modules"``.
+    """
+    # Imported here so the lister module is only loaded when register() runs.
+    from .lister import CpanLister
+
+    task_modules = ["%s.tasks" % __name__]
+    return {"lister": CpanLister, "task_modules": task_modules}
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/lister.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Type of one page yielded by `CpanLister.get_pages`: the list of raw hits
+# (one dict per hit) taken from an API response.
+CpanListerPage = List[Dict[str, Any]]
+
+
+class CpanLister(StatelessLister[CpanListerPage]):
+    """List origins from 'Cpan', the Comprehensive Perl Archive Network.
+
+    Distribution names are fetched page by page from the metacpan API and each
+    name is turned into a ``https://metacpan.org/dist/{pkgname}`` origin url.
+    """
+
+    LISTER_NAME = "cpan"
+    VISIT_TYPE = "cpan"
+    INSTANCE = "cpan"
+
+    BASE_URL = "https://fastapi.metacpan.org/v1/"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.BASE_URL,
+        )
+
+    def get_pages(self) -> Iterator[CpanListerPage]:
+        """Yield the pages of search hits returned by the API.
+
+        The first page comes from the distribution search endpoint; the next
+        ones are fetched from the scroll endpoint with the ``_scroll_id`` of
+        the previous response. Scrolling stops once a page has no hits; that
+        empty page is still yielded.
+        """
+        endpoint = f"{self.BASE_URL}distribution/_search"
+        scrollendpoint = f"{self.BASE_URL}_search/scroll"
+        size: int = 1000
+
+        res = self.http_request(
+            endpoint,
+            params={
+                "fields": ["name"],
+                "size": size,
+                # NOTE(review): "1m" is presumably the scroll keep-alive
+                # duration -- confirm against the API documentation.
+                "scroll": "1m",
+            },
+        )
+        # Decode each response body once and read both keys from the result.
+        body = res.json()
+        data = body["hits"]["hits"]
+        yield data
+
+        scroll_id = body["_scroll_id"]
+
+        while data:
+            scroll_res = self.http_request(
+                scrollendpoint, params={"scroll": "1m", "scroll_id": scroll_id}
+            )
+            body = scroll_res.json()
+            data = body["hits"]["hits"]
+            scroll_id = body["_scroll_id"]
+            yield data
+
+    def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
+        """Yield a ListedOrigin for each entry of ``page`` that has a name.
+
+        Entries without a ``fields`` or ``name`` key, or whose name is empty,
+        are skipped.
+        """
+        assert self.lister_obj.id is not None
+
+        for entry in page:
+            # Skip the entry if 'fields' or 'name' keys are missing
+            if "fields" not in entry or "name" not in entry["fields"]:
+                continue
+
+            pkgname = entry["fields"]["name"]
+            # TODO: Check why the name is sometimes a one value list
+            if not isinstance(pkgname, str):
+                if not pkgname:
+                    # Nothing to build an url from (e.g. an empty list)
+                    continue
+                pkgname = pkgname[0]
+
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=f"https://metacpan.org/dist/{pkgname}",
+                last_update=None,
+            )
diff --git a/swh/lister/cpan/tasks.py b/swh/lister/cpan/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.cpan.lister import CpanLister
+
+
+@shared_task(name=__name__ + ".CpanListerTask")
+def list_cpan(**lister_args):
+    """Lister task for Cpan.
+
+    Build a ``CpanLister`` from its configuration file, forwarding
+    ``lister_args`` to it, run it, and return the run's result converted
+    with ``.dict()``.
+    """
+    lister = CpanLister.from_configfile(**lister_args)
+    stats = lister.run()
+    return stats.dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    """Ping task: always answer "OK"."""
+    answer = "OK"
+    return answer
diff --git a/swh/lister/cpan/tests/__init__.py b/swh/lister/cpan/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
@@ -0,0 +1,50 @@
+{
+ "_shards" : {
+ "successful" : 3,
+ "total" : 3,
+ "failed" : 0
+ },
+ "timed_out" : false,
+ "hits" : {
+ "max_score" : 1.0,
+ "hits" : [
+ {
+ "_type" : "distribution",
+ "fields" : {
+ "name" : [
+ "EventSource-Server"
+ ]
+ },
+ "_id" : "EventSource-Server",
+ "_index" : "cpan_v1_01",
+ "_score" : 1.0
+ },
+ {
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "Interchange6",
+ "fields" : {
+ "name" : [
+ "Interchange6"
+ ]
+ },
+ "_type" : "distribution"
+ },
+ {
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "Internals-CountObjects",
+ "fields" : {
+ "name" : [
+ "Internals-CountObjects"
+ ]
+ },
+ "_type" : "distribution"
+ }
+ ],
+ "total" : 43675
+ },
+ "took" : 72,
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "terminated_early" : true
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
@@ -0,0 +1,16 @@
+{
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "took" : 1,
+ "hits" : {
+ "hits" : [],
+ "total" : 43675,
+ "max_score" : 1.0
+ },
+ "terminated_early" : true,
+ "timed_out" : false,
+ "_shards" : {
+ "failed" : 0,
+ "total" : 3,
+ "successful" : 3
+ }
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
@@ -0,0 +1,52 @@
+{
+ "_shards" : {
+ "successful" : 3,
+ "failed" : 0,
+ "total" : 3
+ },
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "took" : 61,
+ "hits" : {
+ "max_score" : 1.0,
+ "hits" : [
+ {
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "openerserver_perl-master",
+ "fields" : {
+ "name" : "openerserver_perl-master"
+ },
+ "_type" : "distribution"
+ },
+ {
+ "_score" : 1.0,
+ "_type" : "distribution",
+ "fields" : {
+ "name" : "Getopt_Auto"
+ },
+ "_id" : "Getopt_Auto",
+ "_index" : "cpan_v1_01"
+ },
+ {
+ "_id" : "App-Booklist",
+ "_index" : "cpan_v1_01",
+ "_type" : "distribution",
+ "fields" : {
+ "name" : "App-Booklist"
+ },
+ "_score" : 1.0
+ },
+ {
+ "fields" : {
+ "name" : "EuclideanRhythm"
+ },
+ "_type" : "distribution",
+ "_index" : "cpan_v1_01",
+ "_id" : "EuclideanRhythm",
+ "_score" : 1.0
+ }
+ ],
+ "total" : 43675
+ },
+ "timed_out" : false
+}
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.cpan.lister import CpanLister
+
+# Origin urls expected once the recorded API responses (4 hits on the first
+# page, 3 on the second) have been listed.
+expected_origins = [
+ "https://metacpan.org/dist/App-Booklist",
+ "https://metacpan.org/dist/EuclideanRhythm",
+ "https://metacpan.org/dist/EventSource-Server",
+ "https://metacpan.org/dist/Getopt_Auto",
+ "https://metacpan.org/dist/Interchange6",
+ "https://metacpan.org/dist/Internals-CountObjects",
+ "https://metacpan.org/dist/openerserver_perl-master",
+]
+
+
+def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
+    """Run the lister on the recorded API responses and check what it lists."""
+    lister = CpanLister(scheduler=swh_scheduler)
+    stats = lister.run()
+
+    # Two pages with 4 and 3 hits, then the empty page that ends the scroll
+    assert stats.pages == 3
+    assert stats.origins == 4 + 3 + 0
+
+    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(listed) == len(expected_origins)
+
+    for listed_origin in listed:
+        assert listed_origin.visit_type == "cpan"
+        assert listed_origin.url in expected_origins
diff --git a/swh/lister/cpan/tests/test_tasks.py b/swh/lister/cpan/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_cpan_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task of the cpan lister must succeed and answer "OK"."""
+    task_name = "swh.lister.cpan.tasks.ping"
+    res = swh_scheduler_celery_app.send_task(task_name)
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_cpan_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    """The listing task must build the lister from its config file, run it
+    once and return the resulting stats as a dict."""
+    # Replace CpanLister in the tasks module with a mock returning known stats
+    mocked_lister = mocker.patch("swh.lister.cpan.tasks.CpanLister")
+    mocked_lister.from_configfile.return_value = mocked_lister
+    expected_stats = ListerStats(pages=42, origins=42)
+    mocked_lister.run.return_value = expected_stats
+
+    task_name = "swh.lister.cpan.tasks.CpanListerTask"
+    res = swh_scheduler_celery_app.send_task(task_name)
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == expected_stats.dict()
+
+    mocked_lister.from_configfile.assert_called_once_with()
+    mocked_lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 12:09 PM (15 h, 13 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230407

Event Timeline