diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py --- a/swh/lister/cpan/__init__.py +++ b/swh/lister/cpan/__init__.py @@ -16,9 +16,9 @@ Origins retrieving strategy --------------------------- -To get a list of all package names we call a first `http api endpoint`_ that -retrieve results and a ``_scroll_id`` that will be used to scroll pages through -`search`_ endpoint. +To get a list of all package names and their associated release artifacts we call +a first `http api endpoint`_ that retrieves results and a ``_scroll_id`` that will +be used to scroll pages through the `search`_ endpoint. Page listing ------------ @@ -57,7 +57,7 @@ .. _cpan.org: https://cpan.org/ .. _metacpan.org: https://metacpan.org/ -.. _http api endpoint: https://explorer.metacpan.org/?url=/distribution/ +.. _http api endpoint: https://explorer.metacpan.org/?url=/release/ .. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950 diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -3,8 +3,12 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict +from datetime import datetime import logging -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional, Set + +import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -14,7 +18,33 @@ logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. -CpanListerPage = List[Dict[str, Any]] +CpanListerPage = Set[str] + + +def get_field_value(entry, field_name): + """ + Splits ``field_name`` on ``.``, and use it as path in the nested ``entry`` + dictionary. If a value does not exist, returns None. 
+ + >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}} + >>> get_field_value(entry, "foo") + 1 + >>> get_field_value(entry, "bar") + {'baz': 2, 'qux': [3]} + >>> get_field_value(entry, "bar.baz") + 2 + >>> get_field_value(entry, "bar.qux") + 3 + """ + fields = field_name.split(".") + field_value = entry["_source"] + for field in fields[:-1]: + field_value = field_value.get(field, {}) + field_value = field_value.get(fields[-1]) + # scrolled results might have field value in a list + if isinstance(field_value, list): + field_value = field_value[0] + return field_value class CpanLister(StatelessLister[CpanListerPage]): @@ -25,7 +55,15 @@ VISIT_TYPE = "cpan" INSTANCE = "cpan" - BASE_URL = "https://fastapi.metacpan.org/v1/" + API_BASE_URL = "https://fastapi.metacpan.org/v1" + REQUIRED_DOC_FIELDS = [ + "download_url", + "checksum_sha256", + "distribution", + "version", + ] + OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] + ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" def __init__( self, @@ -36,26 +74,82 @@ scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, - url=self.BASE_URL, + url=self.API_BASE_URL, ) + self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + self.release_dates: Dict[str, List[datetime]] = defaultdict(list) + self.module_names: Set[str] = set() + + def process_release_page(self, page: List[Dict[str, Any]]): + for entry in page: + + if "_source" not in entry or not all( + k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS + ): + logger.warning( + "Skipping release entry %s as some required fields are missing", + entry.get("_source"), + ) + continue + + module_name = get_field_value(entry, "distribution") + module_version = get_field_value(entry, "version") + module_download_url = get_field_value(entry, "download_url") + module_sha256_checksum = get_field_value(entry, 
"checksum_sha256") + module_date = get_field_value(entry, "date") + module_size = get_field_value(entry, "stat.size") + module_author = get_field_value(entry, "author") + module_author_fullname = get_field_value(entry, "metadata.author") + release_name = get_field_value(entry, "name") + + self.artifacts[module_name].append( + { + "url": module_download_url, + "filename": module_download_url.split("/")[-1], + "checksums": {"sha256": module_sha256_checksum}, + "version": module_version, + "length": module_size, + } + ) + + self.module_metadata[module_name].append( + { + "name": module_name, + "version": module_version, + "cpan_author": module_author, + "author": ( + module_author_fullname + if module_author_fullname not in (None, "", "unknown") + else module_author + ), + "date": module_date, + "release_name": release_name, + } + ) + + self.release_dates[module_name].append(iso8601.parse_date(module_date)) + + self.module_names.add(module_name) + def get_pages(self) -> Iterator[CpanListerPage]: """Yield an iterator which returns 'page'""" - endpoint = f"{self.BASE_URL}distribution/_search" - scrollendpoint = f"{self.BASE_URL}_search/scroll" - size: int = 1000 + endpoint = f"{self.API_BASE_URL}/release/_search" + scrollendpoint = f"{self.API_BASE_URL}/_search/scroll" + size = 1000 res = self.http_request( endpoint, params={ - "fields": ["name"], + "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS, "size": size, "scroll": "1m", }, ) data = res.json()["hits"]["hits"] - yield data + self.process_release_page(data) _scroll_id = res.json()["_scroll_id"] @@ -65,27 +159,25 @@ ) data = scroll_res.json()["hits"]["hits"] _scroll_id = scroll_res.json()["_scroll_id"] - yield data + self.process_release_page(data) - def get_origins_from_page(self, page: CpanListerPage) -> Iterator[ListedOrigin]: + yield self.module_names + + def get_origins_from_page( + self, module_names: CpanListerPage + ) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin 
instances.""" assert self.lister_obj.id is not None - for entry in page: - # Skip the entry if 'fields' or 'name' keys are missing - if "fields" not in entry or "name" not in entry["fields"]: - continue - - pkgname = entry["fields"]["name"] - # TODO: Check why sometimes its a one value list - if type(pkgname) != str: - pkgname = pkgname[0] - - url = f"https://metacpan.org/dist/{pkgname}" - + for module_name in module_names: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=url, - last_update=None, + url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), + last_update=max(self.release_dates[module_name]), + extra_loader_arguments={ + "api_base_url": self.API_BASE_URL, + "artifacts": self.artifacts[module_name], + "module_metadata": self.module_metadata[module_name], + }, ) diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll new file mode 100644 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll @@ -0,0 +1,247 @@ +{ + "_shards": { + "failed": 0, + "total": 3, + "successful": 3 + }, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", + "terminated_early": true, + "took": 3, + "hits": { + "max_score": 1.0, + "hits": [ + { + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1210", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1210.tar.gz", + "version": "0.1210", + "distribution": "Validator-Custom", + "date": "2010-08-14T01:41:56", + "stat": { + "size": 17608 + }, + "checksum_sha256": "f7240f7793ced2952701f0ed28ecf43c07cc2fa4549cc505831eceb8424cba4a", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_type": "release", + "_index": "cpan_v1_01", + "_id": 
"VGApYqMT4TCxUzHcITn8ZhGHlxE" + }, + { + "_type": "release", + "_id": "ilQN4bpIIdRl6DoiB3y47fgNIk8", + "_index": "cpan_v1_01", + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1208", + "date": "2010-07-28T23:00:52", + "distribution": "Validator-Custom", + "version": "0.1208", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1208.tar.gz", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "e33a860b026cad852eb919da4a3645007b47e5f414eb7272534b10cee279b52b", + "stat": { + "size": 17489 + } + } + }, + { + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1619", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1619.tar.gz", + "version": "0.1619", + "distribution": "DBIx-Custom", + "date": "2010-10-20T15:01:35", + "stat": { + "size": 27195 + }, + "checksum_sha256": "83c295343f48ebc03029139082345c93527ffe5831820f99e4a72ee67ef186a5", + "metadata": { + "author": [ + "unknown" + ] + } + }, + "_score": 1.0, + "_id": "g7562_4h9d693lxvc_cgEOTJAZk", + "_index": "cpan_v1_01", + "_type": "release" + }, + { + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1401", + "version": "0.1401", + "distribution": "DBIx-Custom", + "date": "2010-05-01T23:29:22", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1401.tar.gz", + "checksum_sha256": "004be1d48b6819941b3cb3c53bf457799d811348e0bb15e7cf18211505637aba", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "stat": { + "size": 22711 + } + }, + "_type": "release", + "_id": "bLRsOH2sevNQ6Q93exgkvCZONo0", + "_index": "cpan_v1_01" + }, + { + "_type": "release", + "_index": "cpan_v1_01", + "_id": "D8L3qWKznn0IQZrZEeDi9uyXbJY", + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1301", + "stat": { + "size": 22655 + }, + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": 
"6b39e3ad2bc98f06af3a75c96cd8c056a25f7501ed216a375472c8fe7bbb72be", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1301.tar.gz", + "distribution": "DBIx-Custom", + "version": "0.1301", + "date": "2010-05-01T13:02:19" + } + }, + { + "_score": 1.0, + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1602", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "7a7e18514e171a6c55ef4c8aef92bd548b15ffd7dec4c1fdc83c276a032f6b8a", + "stat": { + "size": 18999 + }, + "date": "2010-06-25T12:11:33", + "distribution": "DBIx-Custom", + "version": "0.1602", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1602.tar.gz" + }, + "_type": "release", + "_index": "cpan_v1_01", + "_id": "kmzgsMLGdsuiHjrSW55lLwMRO4o" + }, + { + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1204", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1204.tar.gz", + "version": "0.1204", + "distribution": "Validator-Custom", + "date": "2010-07-08T13:14:23", + "stat": { + "size": 13256 + }, + "checksum_sha256": "40800b3d92cebc09967b61725cecdd05de2b04649f095e3034c5dd82f3d4ad89", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_score": 1.0, + "_index": "cpan_v1_01", + "_id": "M_lLALu56mb_cDK_jAXwUB2PUlw", + "_type": "release" + }, + { + "_id": "EVuvfiFcvtEr9Ne5Q4QoMAaxe7E", + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1203", + "stat": { + "size": 12572 + }, + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "028a0b41c152c585143167464bed2ac6b6680c8006aa80867f9a8faa4ca5efe7", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1203.tar.gz", + "date": "2010-07-07T13:29:41", + "distribution": "Validator-Custom", + "version": "0.1203" + }, + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_id": 
"ZaT8bwXejVTHmrzZCqNJPRFImBY", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1641", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "940412af9b7faf4c946a5e4d57ca52e5b704e49c4d7d0aa5ecb6d2286477ebc6", + "stat": { + "size": 40480 + }, + "distribution": "DBIx-Custom", + "version": "0.1641", + "date": "2011-01-27T05:19:14", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1641.tar.gz" + }, + "_score": 1.0 + }, + { + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-0.1646", + "version": "0.1646", + "distribution": "DBIx-Custom", + "date": "2011-02-18T17:48:52", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-0.1646.tar.gz", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "checksum_sha256": "7f729311e3e22d36b158e62b42ab2fbd29f08eabd57206e235db939d1ae57d24", + "stat": { + "size": 46577 + } + }, + "_score": 1.0, + "_index": "cpan_v1_01", + "_id": "j21QIzHRYZKz1vobyGAPa2BuO50", + "_type": "release" + } + ], + "total": 359941 + }, + "timed_out": false +} \ No newline at end of file diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== deleted file mode 100644 --- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw== +++ /dev/null @@ -1,50 +0,0 @@ -{ - "_shards" : { - 
"successful" : 3, - "total" : 3, - "failed" : 0 - }, - "timed_out" : false, - "hits" : { - "max_score" : 1.0, - "hits" : [ - { - "_type" : "distribution", - "fields" : { - "name" : [ - "EventSource-Server" - ] - }, - "_id" : "EventSource-Server", - "_index" : "cpan_v1_01", - "_score" : 1.0 - }, - { - "_score" : 1.0, - "_index" : "cpan_v1_01", - "_id" : "Interchange6", - "fields" : { - "name" : [ - "Interchange6" - ] - }, - "_type" : "distribution" - }, - { - "_score" : 1.0, - "_index" : "cpan_v1_01", - "_id" : "Internals-CountObjects", - "fields" : { - "name" : [ - "Internals-CountObjects" - ] - }, - "_type" : "distribution" - } - ], - "total" : 43675 - }, - "took" : 72, - "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", - "terminated_early" : true -} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 deleted file mode 100644 --- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll,scroll=1m,scroll_id=cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==_visit1 +++ /dev/null @@ -1,16 +0,0 @@ -{ - "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", - "took" : 1, - "hits" : { - "hits" : [], - "total" : 43675, - 
"max_score" : 1.0 - }, - "terminated_early" : true, - "timed_out" : false, - "_shards" : { - "failed" : 0, - "total" : 3, - "successful" : 3 - } -} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m deleted file mode 100644 --- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_distribution__search,fields=name,size=1000,scroll=1m +++ /dev/null @@ -1,52 +0,0 @@ -{ - "_shards" : { - "successful" : 3, - "failed" : 0, - "total" : 3 - }, - "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7Mzs5NTU1MTQ1NTk6eXptdmszQUNUam1XbVJjRjRkRk9Udzs5NTQ5NjQ5NjI6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5NTU1MTQ1NjA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==", - "took" : 61, - "hits" : { - "max_score" : 1.0, - "hits" : [ - { - "_score" : 1.0, - "_index" : "cpan_v1_01", - "_id" : "openerserver_perl-master", - "fields" : { - "name" : "openerserver_perl-master" - }, - "_type" : "distribution" - }, - { - "_score" : 1.0, - "_type" : "distribution", - "fields" : { - "name" : "Getopt_Auto" - }, - "_id" : "Getopt_Auto", - "_index" : "cpan_v1_01" - }, - { - "_id" : "App-Booklist", - "_index" : "cpan_v1_01", - "_type" : "distribution", - "fields" : { - "name" : "App-Booklist" - }, - "_score" : 1.0 - }, - { - "fields" : { - "name" : "EuclideanRhythm" - }, - "_type" : "distribution", - "_index" : "cpan_v1_01", - "_id" : "EuclideanRhythm", - "_score" : 1.0 - } - ], - "total" : 43675 - }, - "timed_out" : false -} diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search new file mode 100644 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search @@ -0,0 +1,246 @@ +{ + "timed_out": false, + "_shards": { + "total": 3, + "failed": 0, + "successful": 3 + }, + "hits": { + "hits": [ + { + "_index": 
"cpan_v1_01", + "_id": "40MmOvf_SQx_mr8Kj9Eush14a3E", + "_source": { + "author": "KRYDE", + "name": "math-image-46", + "date": "2011-03-02T00:46:14", + "download_url": "https://cpan.metacpan.org/authors/id/K/KR/KRYDE/math-image-46.tar.gz", + "checksum_sha256": "6bd988e3959feb1071d3b9953d16e723af66bdb7b5440ea17add8709d95f20fa", + "version": "46", + "stat": { + "size": 533502 + }, + "distribution": "math-image", + "metadata": { + "author": [ + "Kevin Ryde " + ] + } + }, + "_type": "release", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_source": { + "author": "MITHALDU", + "name": "Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL", + "distribution": "Dist-Zilla-Plugin-ProgCriticTests", + "metadata": { + "author": [ + "Christian Walde " + ] + }, + "stat": { + "size": 16918 + }, + "checksum_sha256": "ef8c92d0fc55551392a6daeee20a1c13a3ee1bcd0fcacf611cbc2a6cc503f401", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHALDU/Dist-Zilla-Plugin-ProgCriticTests-1.101580-TRIAL.tar.gz", + "date": "2010-06-07T14:43:36", + "version": "1.101580" + }, + "_id": "6df77_MLO_BG8YC_vQKsay7OFYM", + "_type": "release", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "MITHUN", + "name": "Net-Rapidshare-v0.04", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.04.tar.gz", + "date": "2009-07-28T05:57:26", + "checksum_sha256": "f01456a8f8c2b6806a8dd041cf848f330884573d363b28c8b3ff12e837fa8f4f", + "version": "v0.04", + "distribution": "Net-Rapidshare", + "metadata": { + "author": [ + "unknown" + ] + }, + "stat": { + "size": 15068 + } + }, + "_id": "jCs3ZLWuoetrkMLOFKV3YTSr_fM", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_source": { + "author": "MITHUN", + "name": "Net-Rapidshare-v0.05", + "version": "v0.05", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.05.tgz", + "date": "2009-12-21T00:29:48", + "checksum_sha256": 
"e1128d3b35616530d9722d0fe3f5f0e343fd914bc8f9c0df55c1a9ad6c7402fe", + "metadata": { + "author": [ + "unknown" + ] + }, + "distribution": "Net-Rapidshare", + "stat": { + "size": 15971 + } + }, + "_id": "pExMIwabhz_0S1rX7xAY_lq0GTY", + "_type": "release", + "_score": 1.0 + }, + { + "_type": "release", + "_source": { + "author": "MITHUN", + "name": "Net-Rapidshare-v0.0.1", + "version": "v0.0.1", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MITHUN/Net-Rapidshare-v0.0.1.tar.gz", + "checksum_sha256": "990de0a72753fa182e7a5867e55fd6755375b71280bb7e5b3a5f07c4de8af905", + "date": "2009-07-18T22:56:38", + "stat": { + "size": 15161 + }, + "metadata": { + "author": [ + "unknown" + ] + }, + "distribution": "Net-Rapidshare" + }, + "_id": "eqkhDnj0efXHisWRrMZZ1EHFgug", + "_index": "cpan_v1_01", + "_score": 1.0 + }, + { + "_score": 1.0, + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-Basic-0.0101", + "stat": { + "size": 3409 + }, + "distribution": "DBIx-Custom-Basic", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-Basic-0.0101.tar.gz", + "date": "2009-11-08T04:18:30", + "checksum_sha256": "86f68b2d0789934aa6b0202345e9807c5b650f8030b55d0d669ef25293fa3f1f", + "version": "0.0101" + }, + "_id": "oKf3t0pXHXa6mZ_4sUZSaSMKuXg" + }, + { + "_score": 1.0, + "_index": "cpan_v1_01", + "_source": { + "author": "KIMOTO", + "name": "DBIx-Custom-SQLite-0.0101", + "version": "0.0101", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/DBIx-Custom-SQLite-0.0101.tar.gz", + "date": "2009-11-08T04:20:31", + "checksum_sha256": "0af123551dff95f9654f4fbc24e945c5d6481b92e67b8e03ca91ef4c83088cc7", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "distribution": "DBIx-Custom-SQLite", + "stat": { + "size": 3927 + } + }, + "_type": "release", + "_id": "zpVA3zMoUhx0mj8Cn4YC9CuFyA8" + }, + { + "_index": "cpan_v1_01", + 
"_source": { + "author": "KIMOTO", + "name": "Validator-Custom-Ext-Mojolicious-0.0103", + "checksum_sha256": "0911fe6ae65f9173c6eb68b6116600552b088939b94881be3c7275344b1cbdce", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0103.tar.gz", + "date": "2010-01-16T14:51:11", + "version": "0.0103", + "stat": { + "size": 4190 + }, + "distribution": "Validator-Custom-Ext-Mojolicious", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_id": "mY_jP2O7NnTtr3utv_xZQNu10Ic", + "_type": "release", + "_score": 1.0 + }, + { + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-Ext-Mojolicious-0.0102", + "stat": { + "size": 4257 + }, + "distribution": "Validator-Custom-Ext-Mojolicious", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + }, + "date": "2010-01-15T14:07:24", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-Ext-Mojolicious-0.0102.tar.gz", + "checksum_sha256": "a88d01504353223f7a3cb0d6a240debb9c6d6155858f1048a19007c3b366beed", + "version": "0.0102" + }, + "_id": "WZm6hQ6mBfOqgVE6dPQOE0L8hg0", + "_type": "release", + "_index": "cpan_v1_01", + "_score": 1.0 + }, + { + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "author": "KIMOTO", + "name": "Validator-Custom-0.1207", + "download_url": "https://cpan.metacpan.org/authors/id/K/KI/KIMOTO/Validator-Custom-0.1207.tar.gz", + "date": "2010-07-28T13:42:23", + "checksum_sha256": "f599da2ecc17ac74443628eb84233ee6b25b204511f83ea778dad9efd0f558e0", + "version": "0.1207", + "stat": { + "size": 16985 + }, + "distribution": "Validator-Custom", + "metadata": { + "author": [ + "Yuki Kimoto " + ] + } + }, + "_id": "NWJOqmjEinjfJqawfpkEpEhu4d0", + "_score": 1.0 + } + ], + "total": 359941, + "max_score": 1.0 + }, + "took": 14, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==" +} \ 
No newline at end of file diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py --- a/swh/lister/cpan/tests/test_lister.py +++ b/swh/lister/cpan/tests/test_lister.py @@ -3,25 +3,95 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict +from itertools import chain +import json +from pathlib import Path + +import pytest + from swh.lister.cpan.lister import CpanLister -expected_origins = [ - "https://metacpan.org/dist/App-Booklist", - "https://metacpan.org/dist/EuclideanRhythm", - "https://metacpan.org/dist/EventSource-Server", - "https://metacpan.org/dist/Getopt_Auto", - "https://metacpan.org/dist/Interchange6", - "https://metacpan.org/dist/Internals-CountObjects", - "https://metacpan.org/dist/openerserver_perl-master", -] +@pytest.fixture +def release_search_response(datadir): + return json.loads( + Path(datadir, "https_fastapi.metacpan.org", "v1_release__search").read_bytes() + ) + + +@pytest.fixture +def release_scroll_first_response(datadir): + return json.loads( + Path(datadir, "https_fastapi.metacpan.org", "v1__search_scroll").read_bytes() + ) -def test_cpan_lister(datadir, requests_mock_datadir_visits, swh_scheduler): + +@pytest.fixture(autouse=True) +def mock_network_requests( + requests_mock, release_search_response, release_scroll_first_response +): + requests_mock.get( + "https://fastapi.metacpan.org/v1/release/_search", + json=release_search_response, + ) + requests_mock.get( + "https://fastapi.metacpan.org/v1/_search/scroll", + [ + { + "json": release_scroll_first_response, + }, + {"json": {"hits": {"hits": []}, "_scroll_id": ""}}, + ], + ) + + +def test_cpan_lister( + swh_scheduler, release_search_response, release_scroll_first_response +): lister = CpanLister(scheduler=swh_scheduler) res = lister.run() - assert res.pages == 3 - assert res.origins == 4 + 3 + 0 + expected_origins = set() + expected_artifacts = 
defaultdict(list) + expected_module_metadata = defaultdict(list) + for release in chain( + release_search_response["hits"]["hits"], + release_scroll_first_response["hits"]["hits"], + ): + distribution = release["_source"]["distribution"] + release_name = release["_source"]["name"] + checksum_sha256 = release["_source"]["checksum_sha256"] + download_url = release["_source"]["download_url"] + version = release["_source"]["version"] + size = release["_source"]["stat"]["size"] + author = release["_source"]["author"] + author_fullname = release["_source"]["metadata"]["author"][0] + date = release["_source"]["date"] + origin_url = f"https://metacpan.org/dist/{distribution}" + expected_origins.add(origin_url) + expected_artifacts[origin_url].append( + { + "url": download_url, + "filename": download_url.split("/")[-1], + "version": version, + "length": size, + "checksums": {"sha256": checksum_sha256}, + } + ) + expected_module_metadata[origin_url].append( + { + "name": distribution, + "version": version, + "cpan_author": author, + "author": author_fullname if author_fullname != "unknown" else author, + "date": date, + "release_name": release_name, + } + ) + + assert res.pages == 1 + assert res.origins == len(expected_origins) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results @@ -30,3 +100,8 @@ for origin in scheduler_origins: assert origin.visit_type == "cpan" assert origin.url in expected_origins + assert origin.extra_loader_arguments == { + "api_base_url": "https://fastapi.metacpan.org/v1", + "artifacts": expected_artifacts[origin.url], + "module_metadata": expected_module_metadata[origin.url], + }