Page Menu | Home | Software Heritage

D8615.diff
No One | Temporary

D8615.diff

diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
--- a/swh/lister/cpan/__init__.py
+++ b/swh/lister/cpan/__init__.py
@@ -16,9 +16,9 @@
Origins retrieving strategy
---------------------------
-To get a list of all package names we call a first `http api endpoint`_ that
-retrieve results and a ``_scroll_id`` that will be used to scroll pages through
-`search`_ endpoint.
+To get a list of all package names and their associated release artifacts we call
+a first `http api endpoint`_ that retrieves results and a ``_scroll_id`` that will
+be used to scroll pages through the `search`_ endpoint.
Page listing
------------
@@ -57,7 +57,7 @@
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
-.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
+.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
--- a/swh/lister/cpan/lister.py
+++ b/swh/lister/cpan/lister.py
@@ -3,8 +3,12 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from datetime import datetime
import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set
+
+import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -14,7 +18,33 @@
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
-CpanListerPage = List[Dict[str, Any]]
+CpanListerPage = Set[str]
+
+
+def get_field_value(entry, field_name):
+ """
+ Splits ``field_name`` on ``.`` and uses the parts as a path into the nested
+ ``entry["_source"]`` dictionary. If a value does not exist, returns None.
+
+ >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}}
+ >>> get_field_value(entry, "foo")
+ 1
+ >>> get_field_value(entry, "bar")
+ {'baz': 2, 'qux': [3]}
+ >>> get_field_value(entry, "bar.baz")
+ 2
+ >>> get_field_value(entry, "bar.qux")
+ 3
+ """
+ fields = field_name.split(".")
+ field_value = entry["_source"]
+ for field in fields[:-1]:
+ field_value = field_value.get(field, {})
+ field_value = field_value.get(fields[-1])
+ # scrolled results might wrap the field value in a list
+ if isinstance(field_value, list):
+ field_value = field_value[0]
+ return field_value
class CpanLister(StatelessLister[CpanListerPage]):
@@ -25,7 +55,15 @@
VISIT_TYPE = "cpan"
INSTANCE = "cpan"
- BASE_URL = "https://fastapi.metacpan.org/v1/"
+ API_BASE_URL = "https://fastapi.metacpan.org/v1"
+ REQUIRED_DOC_FIELDS = [
+ "download_url",
+ "checksum_sha256",
+ "distribution",
+ "version",
+ ]
+ OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"]
+ ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}"
def __init__(
self,
@@ -36,26 +74,82 @@
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
- url=self.BASE_URL,
+ url=self.API_BASE_URL,
)
+ self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+ self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+ self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
+ self.module_names: Set[str] = set()
+
+ def process_release_page(self, page: List[Dict[str, Any]]):
+ for entry in page:
+
+ if "_source" not in entry or not all(
+ k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS
+ ):
+ logger.warning(
+ "Skipping release entry %s as some required fields are missing",
+ entry.get("_source"),
+ )
+ continue
+
+ module_name = get_field_value(entry, "distribution")
+ module_version = get_field_value(entry, "version")
+ module_download_url = get_field_value(entry, "download_url")
+ module_sha256_checksum = get_field_value(entry, "checksum_sha256")
+ module_date = get_field_value(entry, "date")
+ module_size = get_field_value(entry, "stat.size")
+ module_author = get_field_value(entry, "author")
+ module_author_fullname = get_field_value(entry, "metadata.author")
+ release_name = get_field_value(entry, "name")
+
+ self.artifacts[module_name].append(
+ {
+ "url": module_download_url,
+ "filename": module_download_url.split("/")[-1],
+ "checksums": {"sha256": module_sha256_checksum},
+ "version": module_version,
+ "length": module_size,
+ }
+ )
+
+ self.module_metadata[module_name].append(
+ {
+ "name": module_name,
+ "version": module_version,
+ "cpan_author": module_author,
+ "author": (
+ module_author_fullname
+ if module_author_fullname not in (None, "", "unknown")
+ else module_author
+ ),
+ "date": module_date,
+ "release_name": release_name,
+ }
+ )
+
+ self.release_dates[module_name].append(iso8601.parse_date(module_date))
+
+ self.module_names.add(module_name)
+
def get_pages(self) -> Iterator[CpanListerPage]:
"""Yield an iterator which returns 'page'"""
- endpoint = f"{self.BASE_URL}distribution/_search"
- scrollendpoint = f"{self.BASE_URL}_search/scroll"
- size: int = 1000
+ endpoint = f"{self.API_BASE_URL}/release/_search"
+ scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
+ size = 1000
res = self.http_request(
endpoint,
params={
- "fields": ["name"],
+ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
"size": size,
"scroll": "1m",
},
)
data = res.json()["hits"]["hits"]
- yield data
+ self.process_release_page(data)
_scroll_id = res.json()["_scroll_id"]
@@ -65,27 +159,25 @@
)
data = scroll_res.json()["hits"]["hits"]
_scroll_id = scroll_res.json()["_scroll_id"]
- yield data
+ self.process_release_page(data)
- def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
+ yield self.module_names
+
+ def get_origins_from_page(
+ self, module_names: CpanListerPage
+ ) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances."""
assert self.lister_obj.id is not None
- for entry in page:
- # Skip the entry if 'fields' or 'name' keys are missing
- if "fields" not in entry or "name" not in entry["fields"]:
- continue
-
- pkgname = entry["fields"]["name"]
- # TODO: Check why sometimes its a one value list
- if type(pkgname) != str:
- pkgname = pkgname[0]
-
- url = f"https://metacpan.org/dist/{pkgname}"
-
+ for module_name in module_names:
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
- url=url,
- last_update=None,
+ url=self.ORIGIN_URL_PATTERN.format(module_name=module_name),
+ last_update=max(self.release_dates[module_name]),
+ extra_loader_arguments={
+ "api_base_url": self.API_BASE_URL,
+ "artifacts": self.artifacts[module_name],
+ "module_metadata": self.module_metadata[module_name],
+ },
)
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll
@@ -0,0 +1,247 @@
+{
+ "_shards": {
+ "failed": 0,
+ "total": 3,
+ "successful": 3
+ },
+ "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "terminated_early": true,
+ "took": 3,
+ "hits": {
+ "max_score": 1.0,
+ "hits": [
+ {
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1210",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1210.tar.gz",
+ "version": "0.1210",
+ "distribution": "Validator-Custom",
+ "date": "2010-08-14T01:41:56",
+ "stat": {
+ "size": 17608
+ },
+ "checksum_sha256": "f7240f7793ced2952701f0ed28ecf43c07cc2fa4549cc505831eceb8424cba4a",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_id": "VGApYqMT4TCxUzHcITn8ZhGHlxE"
+ },
+ {
+ "_type": "release",
+ "_id": "ilQN4bpIIdRl6DoiB3y47fgNIk8",
+ "_index": "cpan_v1_01",
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1208",
+ "date": "2010-07-28T23:00:52",
+ "distribution": "Validator-Custom",
+ "version": "0.1208",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1208.tar.gz",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "e33a860b026cad852eb919da4a3645007b47e5f414eb7272534b10cee279b52b",
+ "stat": {
+ "size": 17489
+ }
+ }
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1619",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1619.tar.gz",
+ "version": "0.1619",
+ "distribution": "DBIx-Custom",
+ "date": "2010-10-20T15:01:35",
+ "stat": {
+ "size": 27195
+ },
+ "checksum_sha256": "83c295343f48ebc03029139082345c93527ffe5831820f99e4a72ee67ef186a5",
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ }
+ },
+ "_score": 1.0,
+ "_id": "g7562_4h9d693lxvc_cgEOTJAZk",
+ "_index": "cpan_v1_01",
+ "_type": "release"
+ },
+ {
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1401",
+ "version": "0.1401",
+ "distribution": "DBIx-Custom",
+ "date": "2010-05-01T23:29:22",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1401.tar.gz",
+ "checksum_sha256": "004be1d48b6819941b3cb3c53bf457799d811348e0bb15e7cf18211505637aba",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "stat": {
+ "size": 22711
+ }
+ },
+ "_type": "release",
+ "_id": "bLRsOH2sevNQ6Q93exgkvCZONo0",
+ "_index": "cpan_v1_01"
+ },
+ {
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_id": "D8L3qWKznn0IQZrZEeDi9uyXbJY",
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1301",
+ "stat": {
+ "size": 22655
+ },
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "6b39e3ad2bc98f06af3a75c96cd8c056a25f7501ed216a375472c8fe7bbb72be",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1301.tar.gz",
+ "distribution": "DBIx-Custom",
+ "version": "0.1301",
+ "date": "2010-05-01T13:02:19"
+ }
+ },
+ {
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1602",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "7a7e18514e171a6c55ef4c8aef92bd548b15ffd7dec4c1fdc83c276a032f6b8a",
+ "stat": {
+ "size": 18999
+ },
+ "date": "2010-06-25T12:11:33",
+ "distribution": "DBIx-Custom",
+ "version": "0.1602",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1602.tar.gz"
+ },
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_id": "kmzgsMLGdsuiHjrSW55lLwMRO4o"
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1204",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1204.tar.gz",
+ "version": "0.1204",
+ "distribution": "Validator-Custom",
+ "date": "2010-07-08T13:14:23",
+ "stat": {
+ "size": 13256
+ },
+ "checksum_sha256": "40800b3d92cebc09967b61725cecdd05de2b04649f095e3034c5dd82f3d4ad89",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_id": "M_lLALu56mb_cDK_jAXwUB2PUlw",
+ "_type": "release"
+ },
+ {
+ "_id": "EVuvfiFcvtEr9Ne5Q4QoMAaxe7E",
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1203",
+ "stat": {
+ "size": 12572
+ },
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "028a0b41c152c585143167464bed2ac6b6680c8006aa80867f9a8faa4ca5efe7",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1203.tar.gz",
+ "date": "2010-07-07T13:29:41",
+ "distribution": "Validator-Custom",
+ "version": "0.1203"
+ },
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_id": "ZaT8bwXejVTHmrzZCqNJPRFImBY",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1641",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "940412af9b7faf4c946a5e4d57ca52e5b704e49c4d7d0aa5ecb6d2286477ebc6",
+ "stat": {
+ "size": 40480
+ },
+ "distribution": "DBIx-Custom",
+ "version": "0.1641",
+ "date": "2011-01-27T05:19:14",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1641.tar.gz"
+ },
+ "_score": 1.0
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1646",
+ "version": "0.1646",
+ "distribution": "DBIx-Custom",
+ "date": "2011-02-18T17:48:52",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1646.tar.gz",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "7f729311e3e22d36b158e62b42ab2fbd29f08eabd57206e235db939d1ae57d24",
+ "stat": {
+ "size": 46577
+ }
+ },
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_id": "j21QIzHRYZKz1vobyGAPa2BuO50",
+ "_type": "release"
+ }
+ ],
+ "total": 359941
+ },
+ "timed_out": false
+}
\ No newline at end of file
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
deleted file mode 100644
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
+++ /dev/null
@@ -1,50 +0,0 @@
-{
- "_shards" : {
- "successful" : 3,
- "total" : 3,
- "failed" : 0
- },
- "timed_out" : false,
- "hits" : {
- "max_score" : 1.0,
- "hits" : [
- {
- "_type" : "distribution",
- "fields" : {
- "name" : [
- "EventSource-Server"
- ]
- },
- "_id" : "EventSource-Server",
- "_index" : "cpan_v1_01",
- "_score" : 1.0
- },
- {
- "_score" : 1.0,
- "_index" : "cpan_v1_01",
- "_id" : "Interchange6",
- "fields" : {
- "name" : [
- "Interchange6"
- ]
- },
- "_type" : "distribution"
- },
- {
- "_score" : 1.0,
- "_index" : "cpan_v1_01",
- "_id" : "Internals-CountObjects",
- "fields" : {
- "name" : [
- "Internals-CountObjects"
- ]
- },
- "_type" : "distribution"
- }
- ],
- "total" : 43675
- },
- "took" : 72,
- "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
- "terminated_early" : true
-}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
deleted file mode 100644
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
+++ /dev/null
@@ -1,16 +0,0 @@
-{
- "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
- "took" : 1,
- "hits" : {
- "hits" : [],
- "total" : 43675,
- "max_score" : 1.0
- },
- "terminated_early" : true,
- "timed_out" : false,
- "_shards" : {
- "failed" : 0,
- "total" : 3,
- "successful" : 3
- }
-}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
deleted file mode 100644
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
+++ /dev/null
@@ -1,52 +0,0 @@
-{
- "_shards" : {
- "successful" : 3,
- "failed" : 0,
- "total" : 3
- },
- "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
- "took" : 61,
- "hits" : {
- "max_score" : 1.0,
- "hits" : [
- {
- "_score" : 1.0,
- "_index" : "cpan_v1_01",
- "_id" : "openerserver_perl-master",
- "fields" : {
- "name" : "openerserver_perl-master"
- },
- "_type" : "distribution"
- },
- {
- "_score" : 1.0,
- "_type" : "distribution",
- "fields" : {
- "name" : "Getopt_Auto"
- },
- "_id" : "Getopt_Auto",
- "_index" : "cpan_v1_01"
- },
- {
- "_id" : "App-Booklist",
- "_index" : "cpan_v1_01",
- "_type" : "distribution",
- "fields" : {
- "name" : "App-Booklist"
- },
- "_score" : 1.0
- },
- {
- "fields" : {
- "name" : "EuclideanRhythm"
- },
- "_type" : "distribution",
- "_index" : "cpan_v1_01",
- "_id" : "EuclideanRhythm",
- "_score" : 1.0
- }
- ],
- "total" : 43675
- },
- "timed_out" : false
-}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
@@ -0,0 +1,246 @@
+{
+ "timed_out": false,
+ "_shards": {
+ "total": 3,
+ "failed": 0,
+ "successful": 3
+ },
+ "hits": {
+ "hits": [
+ {
+ "_index": "cpan_v1_01",
+ "_id": "40MmOvf_SQx_mr8Kj9Eush14a3E",
+ "_source": {
+ "author": "KRYDE",
+ "name": "math-image-46",
+ "date": "2011-03-02T00:46:14",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KR/KRYDE/math-image-46.tar.gz",
+ "checksum_sha256": "6bd988e3959feb1071d3b9953d16e723af66bdb7b5440ea17add8709d95f20fa",
+ "version": "46",
+ "stat": {
+ "size": 533502
+ },
+ "distribution": "math-image",
+ "metadata": {
+ "author": [
+ "Kevin Ryde <user42@zip.com.au>"
+ ]
+ }
+ },
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "MITHALDU",
+ "name": "Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL",
+ "distribution": "Dist-Zilla-Plugin-ProgCriticTests",
+ "metadata": {
+ "author": [
+ "Christian Walde <mithaldu@yahoo.de>"
+ ]
+ },
+ "stat": {
+ "size": 16918
+ },
+ "checksum_sha256": "ef8c92d0fc55551392a6daeee20a1c13a3ee1bcd0fcacf611cbc2a6cc503f401",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHALDU/Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL.tar.gz",
+ "date": "2010-06-07T14:43:36",
+ "version": "1.101580"
+ },
+ "_id": "6df77_MLO_BG8YC_vQKsay7OFYM",
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "MITHUN",
+ "name": "Net-Rapidshare-v0.04",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.04.tar.gz",
+ "date": "2009-07-28T05:57:26",
+ "checksum_sha256": "f01456a8f8c2b6806a8dd041cf848f330884573d363b28c8b3ff12e837fa8f4f",
+ "version": "v0.04",
+ "distribution": "Net-Rapidshare",
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ },
+ "stat": {
+ "size": 15068
+ }
+ },
+ "_id": "jCs3ZLWuoetrkMLOFKV3YTSr_fM",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "MITHUN",
+ "name": "Net-Rapidshare-v0.05",
+ "version": "v0.05",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.05.tgz",
+ "date": "2009-12-21T00:29:48",
+ "checksum_sha256": "e1128d3b35616530d9722d0fe3f5f0e343fd914bc8f9c0df55c1a9ad6c7402fe",
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ },
+ "distribution": "Net-Rapidshare",
+ "stat": {
+ "size": 15971
+ }
+ },
+ "_id": "pExMIwabhz_0S1rX7xAY_lq0GTY",
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_type": "release",
+ "_source": {
+ "author": "MITHUN",
+ "name": "Net-Rapidshare-v0.0.1",
+ "version": "v0.0.1",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.0.1.tar.gz",
+ "checksum_sha256": "990de0a72753fa182e7a5867e55fd6755375b71280bb7e5b3a5f07c4de8af905",
+ "date": "2009-07-18T22:56:38",
+ "stat": {
+ "size": 15161
+ },
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ },
+ "distribution": "Net-Rapidshare"
+ },
+ "_id": "eqkhDnj0efXHisWRrMZZ1EHFgug",
+ "_index": "cpan_v1_01",
+ "_score": 1.0
+ },
+ {
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-Basic-0.0101",
+ "stat": {
+ "size": 3409
+ },
+ "distribution": "DBIx-Custom-Basic",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-Basic-0.0101.tar.gz",
+ "date": "2009-11-08T04:18:30",
+ "checksum_sha256": "86f68b2d0789934aa6b0202345e9807c5b650f8030b55d0d669ef25293fa3f1f",
+ "version": "0.0101"
+ },
+ "_id": "oKf3t0pXHXa6mZ_4sUZSaSMKuXg"
+ },
+ {
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-SQLite-0.0101",
+ "version": "0.0101",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-SQLite-0.0101.tar.gz",
+ "date": "2009-11-08T04:20:31",
+ "checksum_sha256": "0af123551dff95f9654f4fbc24e945c5d6481b92e67b8e03ca91ef4c83088cc7",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "distribution": "DBIx-Custom-SQLite",
+ "stat": {
+ "size": 3927
+ }
+ },
+ "_type": "release",
+ "_id": "zpVA3zMoUhx0mj8Cn4YC9CuFyA8"
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-Ext-Mojolicious-0.0103",
+ "checksum_sha256": "0911fe6ae65f9173c6eb68b6116600552b088939b94881be3c7275344b1cbdce",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0103.tar.gz",
+ "date": "2010-01-16T14:51:11",
+ "version": "0.0103",
+ "stat": {
+ "size": 4190
+ },
+ "distribution": "Validator-Custom-Ext-Mojolicious",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_id": "mY_jP2O7NnTtr3utv_xZQNu10Ic",
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-Ext-Mojolicious-0.0102",
+ "stat": {
+ "size": 4257
+ },
+ "distribution": "Validator-Custom-Ext-Mojolicious",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "date": "2010-01-15T14:07:24",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0102.tar.gz",
+ "checksum_sha256": "a88d01504353223f7a3cb0d6a240debb9c6d6155858f1048a19007c3b366beed",
+ "version": "0.0102"
+ },
+ "_id": "WZm6hQ6mBfOqgVE6dPQOE0L8hg0",
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1207",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1207.tar.gz",
+ "date": "2010-07-28T13:42:23",
+ "checksum_sha256": "f599da2ecc17ac74443628eb84233ee6b25b204511f83ea778dad9efd0f558e0",
+ "version": "0.1207",
+ "stat": {
+ "size": 16985
+ },
+ "distribution": "Validator-Custom",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_id": "NWJOqmjEinjfJqawfpkEpEhu4d0",
+ "_score": 1.0
+ }
+ ],
+ "total": 359941,
+ "max_score": 1.0
+ },
+ "took": 14,
+ "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
+}
\ No newline at end of file
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
--- a/swh/lister/cpan/tests/test_lister.py
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -3,25 +3,95 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from itertools import chain
+import json
+from pathlib import Path
+
+import pytest
+
from swh.lister.cpan.lister import CpanLister
-expected_origins = [
- "https://metacpan.org/dist/App-Booklist",
- "https://metacpan.org/dist/EuclideanRhythm",
- "https://metacpan.org/dist/EventSource-Server",
- "https://metacpan.org/dist/Getopt_Auto",
- "https://metacpan.org/dist/Interchange6",
- "https://metacpan.org/dist/Internals-CountObjects",
- "https://metacpan.org/dist/openerserver_perl-master",
-]
+@pytest.fixture
+def release_search_response(datadir):
+ return json.loads(
+ Path(datadir, "https_fastapi.metacpan.org", "v1_release__search").read_bytes()
+ )
+
+
+@pytest.fixture
+def release_scroll_first_response(datadir):
+ return json.loads(
+ Path(datadir, "https_fastapi.metacpan.org", "v1__search_scroll").read_bytes()
+ )
-def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
+
+@pytest.fixture(autouse=True)
+def mock_network_requests(
+ requests_mock, release_search_response, release_scroll_first_response
+):
+ requests_mock.get(
+ "https://fastapi.metacpan.org/v1/release/_search",
+ json=release_search_response,
+ )
+ requests_mock.get(
+ "https://fastapi.metacpan.org/v1/_search/scroll",
+ [
+ {
+ "json": release_scroll_first_response,
+ },
+ {"json": {"hits": {"hits": []}, "_scroll_id": ""}},
+ ],
+ )
+
+
+def test_cpan_lister(
+ swh_scheduler, release_search_response, release_scroll_first_response
+):
lister = CpanLister(scheduler=swh_scheduler)
res = lister.run()
- assert res.pages == 3
- assert res.origins == 4 + 3 + 0
+ expected_origins = set()
+ expected_artifacts = defaultdict(list)
+ expected_module_metadata = defaultdict(list)
+ for release in chain(
+ release_search_response["hits"]["hits"],
+ release_scroll_first_response["hits"]["hits"],
+ ):
+ distribution = release["_source"]["distribution"]
+ release_name = release["_source"]["name"]
+ checksum_sha256 = release["_source"]["checksum_sha256"]
+ download_url = release["_source"]["download_url"]
+ version = release["_source"]["version"]
+ size = release["_source"]["stat"]["size"]
+ author = release["_source"]["author"]
+ author_fullname = release["_source"]["metadata"]["author"][0]
+ date = release["_source"]["date"]
+ origin_url = f"https://metacpan.org/dist/{distribution}"
+ expected_origins.add(origin_url)
+ expected_artifacts[origin_url].append(
+ {
+ "url": download_url,
+ "filename": download_url.split("/")[-1],
+ "version": version,
+ "length": size,
+ "checksums": {"sha256": checksum_sha256},
+ }
+ )
+ expected_module_metadata[origin_url].append(
+ {
+ "name": distribution,
+ "version": version,
+ "cpan_author": author,
+ "author": author_fullname if author_fullname != "unknown" else author,
+ "date": date,
+ "release_name": release_name,
+ }
+ )
+
+ assert res.pages == 1
+ assert res.origins == len(expected_origins)
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
@@ -30,3 +100,8 @@
for origin in scheduler_origins:
assert origin.visit_type == "cpan"
assert origin.url in expected_origins
+ assert origin.extra_loader_arguments == {
+ "api_base_url": "https://fastapi.metacpan.org/v1",
+ "artifacts": expected_artifacts[origin.url],
+ "module_metadata": expected_module_metadata[origin.url],
+ }

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 6:52 AM (10 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217094

Event Timeline