Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123517
D8615.id31260.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
36 KB
Subscribers
None
D8615.id31260.diff
View Options
diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
--- a/swh/lister/cpan/__init__.py
+++ b/swh/lister/cpan/__init__.py
@@ -16,9 +16,9 @@
Origins retrieving strategy
---------------------------
-To get a list of all package names we call a first `http api endpoint`_ that
-retrieve results and a ``_scroll_id`` that will be used to scroll pages through
-`search`_ endpoint.
+To get a list of all package names and their associated release artifacts we call
+a first `http api endpoint`_ that retrieves results and a ``_scroll_id`` that will
+be used to scroll pages through the `search`_ endpoint.
Page listing
------------
@@ -57,7 +57,7 @@
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
-.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/
+.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
--- a/swh/lister/cpan/lister.py
+++ b/swh/lister/cpan/lister.py
@@ -3,8 +3,12 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from datetime import datetime
import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set
+
+import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@@ -14,7 +18,33 @@
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
-CpanListerPage = List[Dict[str, Any]]
+CpanListerPage = Set[str]
+
+
+def get_field_value(entry, field_name):
+ """
+ Splits ``field_name`` on ``.``, and uses it as a path in the nested ``entry``
+ dictionary. If a value does not exist, returns None.
+
+ >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}}
+ >>> get_field_value(entry, "foo")
+ 1
+ >>> get_field_value(entry, "bar")
+ {'baz': 2, 'qux': [3]}
+ >>> get_field_value(entry, "bar.baz")
+ 2
+ >>> get_field_value(entry, "bar.qux")
+ 3
+ """
+ fields = field_name.split(".")
+ field_value = entry["_source"]
+ for field in fields[:-1]:
+ field_value = field_value.get(field, {})
+ field_value = field_value.get(fields[-1])
+ # scrolled results might have field value in a list
+ if isinstance(field_value, list):
+ field_value = field_value[0]
+ return field_value
class CpanLister(StatelessLister[CpanListerPage]):
@@ -25,7 +55,15 @@
VISIT_TYPE = "cpan"
INSTANCE = "cpan"
- BASE_URL = "https://fastapi.metacpan.org/v1/"
+ API_BASE_URL = "https://fastapi.metacpan.org/v1"
+ REQUIRED_DOC_FIELDS = [
+ "download_url",
+ "checksum_sha256",
+ "distribution",
+ "version",
+ ]
+ OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"]
+ ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}"
def __init__(
self,
@@ -36,26 +74,82 @@
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
- url=self.BASE_URL,
+ url=self.API_BASE_URL,
)
+ self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+ self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+ self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
+ self.module_names: Set[str] = set()
+
+ def process_release_page(self, page: List[Dict[str, Any]]):
+ for entry in page:
+
+ if "_source" not in entry or not all(
+ k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS
+ ):
+ logger.warning(
+ "Skipping release entry %s as some required fields are missing",
+ entry.get("_source"),
+ )
+ continue
+
+ module_name = get_field_value(entry, "distribution")
+ module_version = get_field_value(entry, "version")
+ module_download_url = get_field_value(entry, "download_url")
+ module_sha256_checksum = get_field_value(entry, "checksum_sha256")
+ module_date = get_field_value(entry, "date")
+ module_size = get_field_value(entry, "stat.size")
+ module_author = get_field_value(entry, "author")
+ module_author_fullname = get_field_value(entry, "metadata.author")
+ release_name = get_field_value(entry, "name")
+
+ self.artifacts[module_name].append(
+ {
+ "url": module_download_url,
+ "filename": module_download_url.split("/")[-1],
+ "checksums": {"sha256": module_sha256_checksum},
+ "version": module_version,
+ "length": module_size,
+ }
+ )
+
+ self.module_metadata[module_name].append(
+ {
+ "name": module_name,
+ "version": module_version,
+ "cpan_author": module_author,
+ "author": (
+ module_author_fullname
+ if module_author_fullname not in (None, "", "unknown")
+ else module_author
+ ),
+ "date": module_date,
+ "release_name": release_name,
+ }
+ )
+
+ self.release_dates[module_name].append(iso8601.parse_date(module_date))
+
+ self.module_names.add(module_name)
+
def get_pages(self) -> Iterator[CpanListerPage]:
"""Yield an iterator which returns 'page'"""
- endpoint = f"{self.BASE_URL}distribution/_search"
- scrollendpoint = f"{self.BASE_URL}_search/scroll"
- size: int = 1000
+ endpoint = f"{self.API_BASE_URL}/release/_search"
+ scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
+ size = 1000
res = self.http_request(
endpoint,
params={
- "fields": ["name"],
+ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
"size": size,
"scroll": "1m",
},
)
data = res.json()["hits"]["hits"]
- yield data
+ self.process_release_page(data)
_scroll_id = res.json()["_scroll_id"]
@@ -65,27 +159,26 @@
)
data = scroll_res.json()["hits"]["hits"]
_scroll_id = scroll_res.json()["_scroll_id"]
- yield data
+ self.process_release_page(data)
- def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]:
+ yield self.module_names
+
+ def get_origins_from_page(
+ self, module_names: CpanListerPage
+ ) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances."""
assert self.lister_obj.id is not None
- for entry in page:
- # Skip the entry if 'fields' or 'name' keys are missing
- if "fields" not in entry or "name" not in entry["fields"]:
- continue
-
- pkgname = entry["fields"]["name"]
- # TODO: Check why sometimes its a one value list
- if type(pkgname) != str:
- pkgname = pkgname[0]
-
- url = f"https://metacpan.org/dist/{pkgname}"
-
+ for module_name in module_names:
+ module_metadata = self.module_metadata[module_name]
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
- url=url,
- last_update=None,
+ url=self.ORIGIN_URL_PATTERN.format(module_name=module_name),
+ last_update=max(self.release_dates[module_name]),
+ extra_loader_arguments={
+ "api_base_url": self.API_BASE_URL,
+ "artifacts": self.artifacts[module_name],
+ "module_metadata": module_metadata,
+ },
)
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll
@@ -0,0 +1,247 @@
+{
+ "_shards": {
+ "failed": 0,
+ "total": 3,
+ "successful": 3
+ },
+ "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
+ "terminated_early": true,
+ "took": 3,
+ "hits": {
+ "max_score": 1.0,
+ "hits": [
+ {
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1210",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1210.tar.gz",
+ "version": "0.1210",
+ "distribution": "Validator-Custom",
+ "date": "2010-08-14T01:41:56",
+ "stat": {
+ "size": 17608
+ },
+ "checksum_sha256": "f7240f7793ced2952701f0ed28ecf43c07cc2fa4549cc505831eceb8424cba4a",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_id": "VGApYqMT4TCxUzHcITn8ZhGHlxE"
+ },
+ {
+ "_type": "release",
+ "_id": "ilQN4bpIIdRl6DoiB3y47fgNIk8",
+ "_index": "cpan_v1_01",
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1208",
+ "date": "2010-07-28T23:00:52",
+ "distribution": "Validator-Custom",
+ "version": "0.1208",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1208.tar.gz",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "e33a860b026cad852eb919da4a3645007b47e5f414eb7272534b10cee279b52b",
+ "stat": {
+ "size": 17489
+ }
+ }
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1619",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1619.tar.gz",
+ "version": "0.1619",
+ "distribution": "DBIx-Custom",
+ "date": "2010-10-20T15:01:35",
+ "stat": {
+ "size": 27195
+ },
+ "checksum_sha256": "83c295343f48ebc03029139082345c93527ffe5831820f99e4a72ee67ef186a5",
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ }
+ },
+ "_score": 1.0,
+ "_id": "g7562_4h9d693lxvc_cgEOTJAZk",
+ "_index": "cpan_v1_01",
+ "_type": "release"
+ },
+ {
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1401",
+ "version": "0.1401",
+ "distribution": "DBIx-Custom",
+ "date": "2010-05-01T23:29:22",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1401.tar.gz",
+ "checksum_sha256": "004be1d48b6819941b3cb3c53bf457799d811348e0bb15e7cf18211505637aba",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "stat": {
+ "size": 22711
+ }
+ },
+ "_type": "release",
+ "_id": "bLRsOH2sevNQ6Q93exgkvCZONo0",
+ "_index": "cpan_v1_01"
+ },
+ {
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_id": "D8L3qWKznn0IQZrZEeDi9uyXbJY",
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1301",
+ "stat": {
+ "size": 22655
+ },
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "6b39e3ad2bc98f06af3a75c96cd8c056a25f7501ed216a375472c8fe7bbb72be",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1301.tar.gz",
+ "distribution": "DBIx-Custom",
+ "version": "0.1301",
+ "date": "2010-05-01T13:02:19"
+ }
+ },
+ {
+ "_score": 1.0,
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1602",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "7a7e18514e171a6c55ef4c8aef92bd548b15ffd7dec4c1fdc83c276a032f6b8a",
+ "stat": {
+ "size": 18999
+ },
+ "date": "2010-06-25T12:11:33",
+ "distribution": "DBIx-Custom",
+ "version": "0.1602",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1602.tar.gz"
+ },
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_id": "kmzgsMLGdsuiHjrSW55lLwMRO4o"
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1204",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1204.tar.gz",
+ "version": "0.1204",
+ "distribution": "Validator-Custom",
+ "date": "2010-07-08T13:14:23",
+ "stat": {
+ "size": 13256
+ },
+ "checksum_sha256": "40800b3d92cebc09967b61725cecdd05de2b04649f095e3034c5dd82f3d4ad89",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_id": "M_lLALu56mb_cDK_jAXwUB2PUlw",
+ "_type": "release"
+ },
+ {
+ "_id": "EVuvfiFcvtEr9Ne5Q4QoMAaxe7E",
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1203",
+ "stat": {
+ "size": 12572
+ },
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "028a0b41c152c585143167464bed2ac6b6680c8006aa80867f9a8faa4ca5efe7",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1203.tar.gz",
+ "date": "2010-07-07T13:29:41",
+ "distribution": "Validator-Custom",
+ "version": "0.1203"
+ },
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_id": "ZaT8bwXejVTHmrzZCqNJPRFImBY",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1641",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "940412af9b7faf4c946a5e4d57ca52e5b704e49c4d7d0aa5ecb6d2286477ebc6",
+ "stat": {
+ "size": 40480
+ },
+ "distribution": "DBIx-Custom",
+ "version": "0.1641",
+ "date": "2011-01-27T05:19:14",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1641.tar.gz"
+ },
+ "_score": 1.0
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-0.1646",
+ "version": "0.1646",
+ "distribution": "DBIx-Custom",
+ "date": "2011-02-18T17:48:52",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1646.tar.gz",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "checksum_sha256": "7f729311e3e22d36b158e62b42ab2fbd29f08eabd57206e235db939d1ae57d24",
+ "stat": {
+ "size": 46577
+ }
+ },
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_id": "j21QIzHRYZKz1vobyGAPa2BuO50",
+ "_type": "release"
+ }
+ ],
+ "total": 359941
+ },
+ "timed_out": false
+}
\ No newline at end of file
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
deleted file mode 100644
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==
+++ /dev/null
@@ -1,50 +0,0 @@
-{
- "_shards" : {
- "successful" : 3,
- "total" : 3,
- "failed" : 0
- },
- "timed_out" : false,
- "hits" : {
- "max_score" : 1.0,
- "hits" : [
- {
- "_type" : "distribution",
- "fields" : {
- "name" : [
- "EventSource-Server"
- ]
- },
- "_id" : "EventSource-Server",
- "_index" : "cpan_v1_01",
- "_score" : 1.0
- },
- {
- "_score" : 1.0,
- "_index" : "cpan_v1_01",
- "_id" : "Interchange6",
- "fields" : {
- "name" : [
- "Interchange6"
- ]
- },
- "_type" : "distribution"
- },
- {
- "_score" : 1.0,
- "_index" : "cpan_v1_01",
- "_id" : "Internals-CountObjects",
- "fields" : {
- "name" : [
- "Internals-CountObjects"
- ]
- },
- "_type" : "distribution"
- }
- ],
- "total" : 43675
- },
- "took" : 72,
- "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
- "terminated_early" : true
-}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
deleted file mode 100644
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1
+++ /dev/null
@@ -1,16 +0,0 @@
-{
- "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
- "took" : 1,
- "hits" : {
- "hits" : [],
- "total" : 43675,
- "max_score" : 1.0
- },
- "terminated_early" : true,
- "timed_out" : false,
- "_shards" : {
- "failed" : 0,
- "total" : 3,
- "successful" : 3
- }
-}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
deleted file mode 100644
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m
+++ /dev/null
@@ -1,52 +0,0 @@
-{
- "_shards" : {
- "successful" : 3,
- "failed" : 0,
- "total" : 3
- },
- "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==",
- "took" : 61,
- "hits" : {
- "max_score" : 1.0,
- "hits" : [
- {
- "_score" : 1.0,
- "_index" : "cpan_v1_01",
- "_id" : "openerserver_perl-master",
- "fields" : {
- "name" : "openerserver_perl-master"
- },
- "_type" : "distribution"
- },
- {
- "_score" : 1.0,
- "_type" : "distribution",
- "fields" : {
- "name" : "Getopt_Auto"
- },
- "_id" : "Getopt_Auto",
- "_index" : "cpan_v1_01"
- },
- {
- "_id" : "App-Booklist",
- "_index" : "cpan_v1_01",
- "_type" : "distribution",
- "fields" : {
- "name" : "App-Booklist"
- },
- "_score" : 1.0
- },
- {
- "fields" : {
- "name" : "EuclideanRhythm"
- },
- "_type" : "distribution",
- "_index" : "cpan_v1_01",
- "_id" : "EuclideanRhythm",
- "_score" : 1.0
- }
- ],
- "total" : 43675
- },
- "timed_out" : false
-}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
@@ -0,0 +1,246 @@
+{
+ "timed_out": false,
+ "_shards": {
+ "total": 3,
+ "failed": 0,
+ "successful": 3
+ },
+ "hits": {
+ "hits": [
+ {
+ "_index": "cpan_v1_01",
+ "_id": "40MmOvf_SQx_mr8Kj9Eush14a3E",
+ "_source": {
+ "author": "KRYDE",
+ "name": "math-image-46",
+ "date": "2011-03-02T00:46:14",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KR/KRYDE/math-image-46.tar.gz",
+ "checksum_sha256": "6bd988e3959feb1071d3b9953d16e723af66bdb7b5440ea17add8709d95f20fa",
+ "version": "46",
+ "stat": {
+ "size": 533502
+ },
+ "distribution": "math-image",
+ "metadata": {
+ "author": [
+ "Kevin Ryde <user42@zip.com.au>"
+ ]
+ }
+ },
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "MITHALDU",
+ "name": "Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL",
+ "distribution": "Dist-Zilla-Plugin-ProgCriticTests",
+ "metadata": {
+ "author": [
+ "Christian Walde <mithaldu@yahoo.de>"
+ ]
+ },
+ "stat": {
+ "size": 16918
+ },
+ "checksum_sha256": "ef8c92d0fc55551392a6daeee20a1c13a3ee1bcd0fcacf611cbc2a6cc503f401",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHALDU/Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL.tar.gz",
+ "date": "2010-06-07T14:43:36",
+ "version": "1.101580"
+ },
+ "_id": "6df77_MLO_BG8YC_vQKsay7OFYM",
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "MITHUN",
+ "name": "Net-Rapidshare-v0.04",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.04.tar.gz",
+ "date": "2009-07-28T05:57:26",
+ "checksum_sha256": "f01456a8f8c2b6806a8dd041cf848f330884573d363b28c8b3ff12e837fa8f4f",
+ "version": "v0.04",
+ "distribution": "Net-Rapidshare",
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ },
+ "stat": {
+ "size": 15068
+ }
+ },
+ "_id": "jCs3ZLWuoetrkMLOFKV3YTSr_fM",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "MITHUN",
+ "name": "Net-Rapidshare-v0.05",
+ "version": "v0.05",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.05.tgz",
+ "date": "2009-12-21T00:29:48",
+ "checksum_sha256": "e1128d3b35616530d9722d0fe3f5f0e343fd914bc8f9c0df55c1a9ad6c7402fe",
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ },
+ "distribution": "Net-Rapidshare",
+ "stat": {
+ "size": 15971
+ }
+ },
+ "_id": "pExMIwabhz_0S1rX7xAY_lq0GTY",
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_type": "release",
+ "_source": {
+ "author": "MITHUN",
+ "name": "Net-Rapidshare-v0.0.1",
+ "version": "v0.0.1",
+ "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.0.1.tar.gz",
+ "checksum_sha256": "990de0a72753fa182e7a5867e55fd6755375b71280bb7e5b3a5f07c4de8af905",
+ "date": "2009-07-18T22:56:38",
+ "stat": {
+ "size": 15161
+ },
+ "metadata": {
+ "author": [
+ "unknown"
+ ]
+ },
+ "distribution": "Net-Rapidshare"
+ },
+ "_id": "eqkhDnj0efXHisWRrMZZ1EHFgug",
+ "_index": "cpan_v1_01",
+ "_score": 1.0
+ },
+ {
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-Basic-0.0101",
+ "stat": {
+ "size": 3409
+ },
+ "distribution": "DBIx-Custom-Basic",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-Basic-0.0101.tar.gz",
+ "date": "2009-11-08T04:18:30",
+ "checksum_sha256": "86f68b2d0789934aa6b0202345e9807c5b650f8030b55d0d669ef25293fa3f1f",
+ "version": "0.0101"
+ },
+ "_id": "oKf3t0pXHXa6mZ_4sUZSaSMKuXg"
+ },
+ {
+ "_score": 1.0,
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "DBIx-Custom-SQLite-0.0101",
+ "version": "0.0101",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-SQLite-0.0101.tar.gz",
+ "date": "2009-11-08T04:20:31",
+ "checksum_sha256": "0af123551dff95f9654f4fbc24e945c5d6481b92e67b8e03ca91ef4c83088cc7",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "distribution": "DBIx-Custom-SQLite",
+ "stat": {
+ "size": 3927
+ }
+ },
+ "_type": "release",
+ "_id": "zpVA3zMoUhx0mj8Cn4YC9CuFyA8"
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-Ext-Mojolicious-0.0103",
+ "checksum_sha256": "0911fe6ae65f9173c6eb68b6116600552b088939b94881be3c7275344b1cbdce",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0103.tar.gz",
+ "date": "2010-01-16T14:51:11",
+ "version": "0.0103",
+ "stat": {
+ "size": 4190
+ },
+ "distribution": "Validator-Custom-Ext-Mojolicious",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_id": "mY_jP2O7NnTtr3utv_xZQNu10Ic",
+ "_type": "release",
+ "_score": 1.0
+ },
+ {
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-Ext-Mojolicious-0.0102",
+ "stat": {
+ "size": 4257
+ },
+ "distribution": "Validator-Custom-Ext-Mojolicious",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ },
+ "date": "2010-01-15T14:07:24",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0102.tar.gz",
+ "checksum_sha256": "a88d01504353223f7a3cb0d6a240debb9c6d6155858f1048a19007c3b366beed",
+ "version": "0.0102"
+ },
+ "_id": "WZm6hQ6mBfOqgVE6dPQOE0L8hg0",
+ "_type": "release",
+ "_index": "cpan_v1_01",
+ "_score": 1.0
+ },
+ {
+ "_index": "cpan_v1_01",
+ "_type": "release",
+ "_source": {
+ "author": "KIMOTO",
+ "name": "Validator-Custom-0.1207",
+ "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1207.tar.gz",
+ "date": "2010-07-28T13:42:23",
+ "checksum_sha256": "f599da2ecc17ac74443628eb84233ee6b25b204511f83ea778dad9efd0f558e0",
+ "version": "0.1207",
+ "stat": {
+ "size": 16985
+ },
+ "distribution": "Validator-Custom",
+ "metadata": {
+ "author": [
+ "Yuki Kimoto <kimoto.yuki@gmail.com>"
+ ]
+ }
+ },
+ "_id": "NWJOqmjEinjfJqawfpkEpEhu4d0",
+ "_score": 1.0
+ }
+ ],
+ "total": 359941,
+ "max_score": 1.0
+ },
+ "took": 14,
+ "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
+}
\ No newline at end of file
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
--- a/swh/lister/cpan/tests/test_lister.py
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -3,25 +3,95 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from collections import defaultdict
+from itertools import chain
+import json
+from pathlib import Path
+
+import pytest
+
from swh.lister.cpan.lister import CpanLister
-expected_origins = [
- "https://metacpan.org/dist/App-Booklist",
- "https://metacpan.org/dist/EuclideanRhythm",
- "https://metacpan.org/dist/EventSource-Server",
- "https://metacpan.org/dist/Getopt_Auto",
- "https://metacpan.org/dist/Interchange6",
- "https://metacpan.org/dist/Internals-CountObjects",
- "https://metacpan.org/dist/openerserver_perl-master",
-]
+@pytest.fixture
+def release_search_response(datadir):
+ return json.loads(
+ Path(datadir, "https_fastapi.metacpan.org", "v1_release__search").read_bytes()
+ )
+
+
+@pytest.fixture
+def release_scroll_first_response(datadir):
+ return json.loads(
+ Path(datadir, "https_fastapi.metacpan.org", "v1__search_scroll").read_bytes()
+ )
-def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler):
+
+@pytest.fixture(autouse=True)
+def mock_network_requests(
+ requests_mock, release_search_response, release_scroll_first_response
+):
+ requests_mock.get(
+ "https://fastapi.metacpan.org/v1/release/_search",
+ json=release_search_response,
+ )
+ requests_mock.get(
+ "https://fastapi.metacpan.org/v1/_search/scroll",
+ [
+ {
+ "json": release_scroll_first_response,
+ },
+ {"json": {"hits": {"hits": []}, "_scroll_id": ""}},
+ ],
+ )
+
+
+def test_cpan_lister(
+ swh_scheduler, release_search_response, release_scroll_first_response
+):
lister = CpanLister(scheduler=swh_scheduler)
res = lister.run()
- assert res.pages == 3
- assert res.origins == 4 + 3 + 0
+ expected_origins = set()
+ expected_artifacts = defaultdict(list)
+ expected_module_metadata = defaultdict(list)
+ for release in chain(
+ release_search_response["hits"]["hits"],
+ release_scroll_first_response["hits"]["hits"],
+ ):
+ distribution = release["_source"]["distribution"]
+ release_name = release["_source"]["name"]
+ checksum_sha256 = release["_source"]["checksum_sha256"]
+ download_url = release["_source"]["download_url"]
+ version = release["_source"]["version"]
+ size = release["_source"]["stat"]["size"]
+ author = release["_source"]["author"]
+ author_fullname = release["_source"]["metadata"]["author"][0]
+ date = release["_source"]["date"]
+ origin_url = f"https://metacpan.org/dist/{distribution}"
+ expected_origins.add(origin_url)
+ expected_artifacts[origin_url].append(
+ {
+ "url": download_url,
+ "filename": download_url.split("/")[-1],
+ "version": version,
+ "length": size,
+ "checksums": {"sha256": checksum_sha256},
+ }
+ )
+ expected_module_metadata[origin_url].append(
+ {
+ "name": distribution,
+ "version": version,
+ "cpan_author": author,
+ "author": author_fullname if author_fullname != "unknown" else author,
+ "date": date,
+ "release_name": release_name,
+ }
+ )
+
+ assert res.pages == 1
+ assert res.origins == len(expected_origins)
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
@@ -30,3 +100,8 @@
for origin in scheduler_origins:
assert origin.visit_type == "cpan"
assert origin.url in expected_origins
+ assert origin.extra_loader_arguments == {
+ "api_base_url": "https://fastapi.metacpan.org/v1",
+ "artifacts": expected_artifacts[origin.url],
+ "module_metadata": expected_module_metadata[origin.url],
+ }
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Dec 19, 9:58 AM (18 h, 59 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3231685
Attached To
D8615: cpan: Improve listing process by querying the metacpan release endpoint
Event Timeline
Log In to Comment