Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123596
D8542.id30804.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D8542.id30804.diff
View Options
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,7 @@
lister.bitbucket=swh.lister.bitbucket:register
lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
+ lister.cpan=swh.lister.cpan:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/__init__.py
@@ -0,0 +1,73 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Cpan lister
+=============
+
+The Cpan lister lists origins from `cpan.org`_, the Comprehensive Perl Archive
+Network. It provides search features via `metacpan.org`_.
+
+As of September 2022, `metacpan.org`_ lists 43675 package names.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names, we first call an `http api endpoint`_ that
+returns results and a ``_scroll_id``, which is then used to scroll through pages
+via the `search`_ endpoint.
+
+Page listing
+------------
+
+Each page returns a list of ``results``, which are raw data from the API response.
+
+Origins from page
+-----------------
+
+Origin url is the html page corresponding to a package name on `metacpan.org`_, following
+this pattern::
+
+ "https://metacpan.org/dist/{pkgname}"
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within the swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/cpan/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker compose up -d
+
+Then schedule a Cpan listing task::
+
+ docker compose exec swh-scheduler swh scheduler task add -p oneshot list-cpan
+
+You can follow the lister execution by displaying the logs of the swh-lister service::
+
+ docker compose logs -f swh-lister
+
+.. _cpan.org: https://cpan.org/
+.. _metacpan.org: https://metacpan.org/
+.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
+.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
+
+
+"""
+
+
+def register():
+ from .lister import CpanLister
+
+ return {
+ "lister": CpanLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/lister.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+CpanListerPage = List[Dict[str, Any]]
+
+
+class CpanLister(StatelessLister[CpanListerPage]):
+ """The Cpan lister lists origins from 'Cpan', the Comprehensive Perl Archive
+ Network."""
+
+ LISTER_NAME = "cpan"
+ VISIT_TYPE = "cpan"
+ INSTANCE = "cpan"
+
+ BASE_URL = "https://fastapi.metacpan.org/v1/"
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=self.BASE_URL,
+ )
+
+ def get_pages(self) -> Iterator[CpanListerPage]:
+ """Yield pages of raw search hits, scrolling until an empty page is returned."""
+
+ endpoint = f"{self.BASE_URL}distribution/_search"
+ scrollendpoint = f"{self.BASE_URL}_search/scroll"
+ size: int = 1000
+
+ res = self.http_request(
+ endpoint,
+ params={
+ "fields": ["name"],
+ "size": size,
+ "scroll": "1m",
+ },
+ )
+ data = res.json()["hits"]["hits"]
+ yield data
+
+ _scroll_id = res.json()["_scroll_id"]
+
+ while data:
+ scroll_res = self.http_request(
+ scrollendpoint, params={"scroll": "1m", "scroll_id": _scroll_id}
+ )
+ data = scroll_res.json()["hits"]["hits"]
+ _scroll_id = scroll_res.json()["_scroll_id"]
+ yield data
+
+ def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
+ """Iterate over the entries of a page and yield ListedOrigin instances."""
+ assert self.lister_obj.id is not None
+
+ for entry in page:
+ # Skip the entry if 'fields' or 'name' keys are missing
+ if "fields" not in entry or "name" not in entry["fields"]:
+ continue
+
+ pkgname = entry["fields"]["name"]
+ # TODO: Check why the name is sometimes a one-value list
+ if type(pkgname) != str:
+ pkgname = pkgname[0]
+
+ url = f"https://metacpan.org/dist/{pkgname}"
+
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=url,
+ last_update=None,
+ )
diff --git a/swh/lister/cpan/tasks.py b/swh/lister/cpan/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.cpan.lister import CpanLister
+
+
+@shared_task(name=__name__ + ".CpanListerTask")
+def list_cpan(**lister_args):
+ """Lister task for Cpan"""
+ return CpanLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/cpan/tests/__init__.py b/swh/lister/cpan/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
@@ -0,0 +1,50 @@
+{
+ "_shards" : {
+ "successful" : 3,
+ "total" : 3,
+ "failed" : 0
+ },
+ "timed_out" : false,
+ "hits" : {
+ "max_score" : 1.0,
+ "hits" : [
+ {
+ "_type" : "distribution",
+ "fields" : {
+ "name" : [
+ "EventSource-Server"
+ ]
+ },
+ "_id" : "EventSource-Server",
+ "_index" : "cpan_v1_01",
+ "_score" : 1.0
+ },
+ {
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "Interchange6",
+ "fields" : {
+ "name" : [
+ "Interchange6"
+ ]
+ },
+ "_type" : "distribution"
+ },
+ {
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "Internals-CountObjects",
+ "fields" : {
+ "name" : [
+ "Internals-CountObjects"
+ ]
+ },
+ "_type" : "distribution"
+ }
+ ],
+ "total" : 43675
+ },
+ "took" : 72,
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "terminated_early" : true
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
@@ -0,0 +1,16 @@
+{
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "took" : 1,
+ "hits" : {
+ "hits" : [],
+ "total" : 43675,
+ "max_score" : 1.0
+ },
+ "terminated_early" : true,
+ "timed_out" : false,
+ "_shards" : {
+ "failed" : 0,
+ "total" : 3,
+ "successful" : 3
+ }
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
@@ -0,0 +1,52 @@
+{
+ "_shards" : {
+ "successful" : 3,
+ "failed" : 0,
+ "total" : 3
+ },
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "took" : 61,
+ "hits" : {
+ "max_score" : 1.0,
+ "hits" : [
+ {
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "openerserver_perl-master",
+ "fields" : {
+ "name" : "openerserver_perl-master"
+ },
+ "_type" : "distribution"
+ },
+ {
+ "_score" : 1.0,
+ "_type" : "distribution",
+ "fields" : {
+ "name" : "Getopt_Auto"
+ },
+ "_id" : "Getopt_Auto",
+ "_index" : "cpan_v1_01"
+ },
+ {
+ "_id" : "App-Booklist",
+ "_index" : "cpan_v1_01",
+ "_type" : "distribution",
+ "fields" : {
+ "name" : "App-Booklist"
+ },
+ "_score" : 1.0
+ },
+ {
+ "fields" : {
+ "name" : "EuclideanRhythm"
+ },
+ "_type" : "distribution",
+ "_index" : "cpan_v1_01",
+ "_id" : "EuclideanRhythm",
+ "_score" : 1.0
+ }
+ ],
+ "total" : 43675
+ },
+ "timed_out" : false
+}
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.cpan.lister import CpanLister
+
+expected_origins = [
+ "https://metacpan.org/dist/App-Booklist",
+ "https://metacpan.org/dist/EuclideanRhythm",
+ "https://metacpan.org/dist/EventSource-Server",
+ "https://metacpan.org/dist/Getopt_Auto",
+ "https://metacpan.org/dist/Interchange6",
+ "https://metacpan.org/dist/Internals-CountObjects",
+ "https://metacpan.org/dist/openerserver_perl-master",
+]
+
+
+def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
+ lister = CpanLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 3
+ assert res.origins == 4 + 3 + 0
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ assert len(scheduler_origins) == len(expected_origins)
+
+ for origin in scheduler_origins:
+ assert origin.visit_type == "cpan"
+ assert origin.url in expected_origins
diff --git a/swh/lister/cpan/tests/test_tasks.py b/swh/lister/cpan/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_cpan_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.cpan.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_cpan_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked CpanLister
+ lister = mocker.patch("swh.lister.cpan.tasks.CpanLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.cpan.tasks.CpanListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Dec 19, 12:09 PM (15 h, 13 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230407
Attached To
D8542: Cpan: List Perl module origins from cpan.org
Event Timeline
Log In to Comment