diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 23b821d..32f7479 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -1,183 +1,199 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from datetime import datetime import logging -from typing import Any, Dict, Iterator, List, Optional, Set +from typing import Any, Dict, Iterator, List, Optional, Set, Union import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. CpanListerPage = Set[str] def get_field_value(entry, field_name): """ Splits ``field_name`` on ``.``, and use it as path in the nested ``entry`` dictionary. If a value does not exist, returns None. >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}} >>> get_field_value(entry, "foo") 1 >>> get_field_value(entry, "bar") {'baz': 2, 'qux': [3]} >>> get_field_value(entry, "bar.baz") 2 >>> get_field_value(entry, "bar.qux") 3 """ fields = field_name.split(".") field_value = entry["_source"] for field in fields[:-1]: field_value = field_value.get(field, {}) field_value = field_value.get(fields[-1]) # scrolled results might have field value in a list if isinstance(field_value, list): field_value = field_value[0] return field_value +def get_module_version( + module_name: str, module_version: Union[str, float, int], release_name: str +) -> str: + # some old versions fail to be parsed and cpan api set version to 0 + if module_version == 0: + prefix = f"{module_name}-" + if release_name.startswith(prefix): + # extract version from release name + module_version = release_name.replace(prefix, "", 1) + return str(module_version) + + class CpanLister(StatelessLister[CpanListerPage]): """The Cpan lister list origins from 'Cpan', the Comprehensive Perl Archive Network.""" LISTER_NAME = "cpan" VISIT_TYPE = "cpan" INSTANCE = "cpan" API_BASE_URL = "https://fastapi.metacpan.org/v1" REQUIRED_DOC_FIELDS = [ "download_url", "checksum_sha256", "distribution", "version", ] OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_BASE_URL, ) self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list) self.release_dates: Dict[str, List[datetime]] = defaultdict(list) self.module_names: Set[str] = set() def process_release_page(self, page: List[Dict[str, Any]]): for entry in page: if "_source" not in entry or not all( k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS ): logger.warning( "Skipping release entry %s as some required fields are missing", entry.get("_source"), ) continue module_name = get_field_value(entry, "distribution") module_version = get_field_value(entry, "version") module_download_url = get_field_value(entry, "download_url") module_sha256_checksum = get_field_value(entry, "checksum_sha256") module_date = get_field_value(entry, "date") module_size = get_field_value(entry, "stat.size") module_author = get_field_value(entry, "author") module_author_fullname = get_field_value(entry, "metadata.author") release_name = get_field_value(entry, "name") + module_version = get_module_version( + module_name, module_version, release_name + ) + self.artifacts[module_name].append( { "url": module_download_url, "filename": module_download_url.split("/")[-1], "checksums": {"sha256": module_sha256_checksum}, "version": module_version, "length": module_size, } ) self.module_metadata[module_name].append( { "name": module_name, "version": module_version, "cpan_author": module_author, "author": ( module_author_fullname if module_author_fullname not in (None, "", "unknown") else module_author ), "date": module_date, "release_name": release_name, } ) self.release_dates[module_name].append(iso8601.parse_date(module_date)) self.module_names.add(module_name) def get_pages(self) -> Iterator[CpanListerPage]: """Yield an iterator which returns 'page'""" endpoint = f"{self.API_BASE_URL}/release/_search" scrollendpoint = f"{self.API_BASE_URL}/_search/scroll" size = 1000 res = self.http_request( endpoint, params={ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS, "size": size, "scroll": "1m", }, ) data = res.json()["hits"]["hits"] self.process_release_page(data) _scroll_id = res.json()["_scroll_id"] while data: scroll_res = self.http_request( scrollendpoint, params={"scroll": "1m", "scroll_id": _scroll_id} ) data = scroll_res.json()["hits"]["hits"] _scroll_id = scroll_res.json()["_scroll_id"] self.process_release_page(data) yield self.module_names def get_origins_from_page( self, module_names: CpanListerPage ) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for module_name in module_names: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), last_update=max(self.release_dates[module_name]), extra_loader_arguments={ "api_base_url": self.API_BASE_URL, "artifacts": self.artifacts[module_name], "module_metadata": self.module_metadata[module_name], }, ) diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page1 similarity index 100% rename from swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll rename to swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page1 diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page2 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page2 new file mode 100644 index 0000000..c98d4b1 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page2 @@ -0,0 +1,39 @@ +{ + "_shards": { + "successful": 3, + "failed": 0, + "total": 3 + }, + "hits": { + "max_score": 16.105877, + "hits": [ + { + "_id": "FM3U2W_LR4pgKJepBaDKUb4WEy0", + "_index": "cpan_v1_01", + "_type": "release", + "_source": { + "distribution": "UDPServersAndClients", + "date": "2006-04-20T00:03:25", + "checksum_sha256": "763da87c32e65cc7ff72d70a503b4e9497f6b506c174b82c97671af8667c1922", + "stat": { + "size": 5576 + }, + "author": "ROBINBANK", + "version": 0, + "download_url": "https://cpan.metacpan.org/authors/id/R/RO/ROBINBANK/UDPServersAndClients.zip", + "metadata": { + "author": [ + "unknown" + ] + }, + "name": "UDPServersAndClients" + }, + "_score": 16.105877 + } + ], + "total": 1 + }, + "took": 2, + "timed_out": false, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==" +} \ No newline at end of file diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page3 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page3 new file mode 100644 index 0000000..59011f9 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page3 @@ -0,0 +1,85 @@ +{ + "took": 3, + "_shards": { + "successful": 3, + "failed": 0, + "total": 3 + }, + "timed_out": false, + "hits": { + "max_score": 13.962857, + "hits": [ + { + "_score": 13.962857, + "_type": "release", + "_source": { + "version": 0, + "checksum_sha256": "a19fa7e735ea3406dfeb9c72f35fb2b64fda1e8035ce6ba0fabc15ce1c1e2f41", + "metadata": { + "author": [ + "unknown" + ] + }, + "author": "MICB", + "name": "Compiler-a3", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MICB/Compiler-a3.tar.gz", + "date": "1996-09-02T14:04:00", + "stat": { + "size": 89134 + }, + "distribution": "Compiler" + }, + "_id": "aBI9p6X_yq6r9e8pk7U17pbZMPM", + "_index": "cpan_v1_01" + }, + { + "_score": 13.707853, + "_source": { + "checksum_sha256": "def01b544d23c76ec19cc2288a3295b39abcdbdea6dbded5b7fe6d17cd4525de", + "version": 0, + "name": "Compiler-a2", + "author": "MICB", + "metadata": { + "author": [ + "unknown" + ] + }, + "date": "1996-08-22T14:30:00", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MICB/Compiler-a2.tar.gz", + "distribution": "Compiler", + "stat": { + "size": 85123 + } + }, + "_type": "release", + "_id": "fG9UelWPReQei13FQ4EAHytuZCo", + "_index": "cpan_v1_01" + }, + { + "_source": { + "checksum_sha256": "b1f7afd4fa8825adf2c17a0cbd8706484e6d2da5294786a5e6e49c205708ee41", + "version": 0, + "name": "Compiler-a1", + "metadata": { + "author": [ + "unknown" + ] + }, + "author": "MICB", + "date": "1996-05-13T11:39:00", + "download_url": "https://cpan.metacpan.org/authors/id/M/MI/MICB/Compiler-a1.tar.gz", + "stat": { + "size": 61093 + }, + "distribution": "Compiler" + }, + "_type": "release", + "_id": "8H7BRLllDoyILyqsjjV8sqkBpQY", + "_index": "cpan_v1_01", + "_score": 13.572314 + } + ], + "total": 3 + }, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==" +} \ No newline at end of file diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page4 b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page4 new file mode 100644 index 0000000..5d6b861 --- /dev/null +++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page4 @@ -0,0 +1,131 @@ +{ + "timed_out": false, + "_shards": { + "failed": 0, + "total": 3, + "successful": 3 + }, + "took": 14, + "hits": { + "total": 5, + "hits": [ + { + "_score": 14.460719, + "_type": "release", + "_source": { + "stat": { + "size": 10738 + }, + "download_url": "https://cpan.metacpan.org/authors/id/F/FE/FELIPE/Call-Context-0.03-TRIAL1.tar.gz", + "distribution": "Call-Context", + "author": "FELIPE", + "version": "0.03-TRIAL1", + "checksum_sha256": "82aa854d6ae68342b58361b089c7f480b5b75e94f0c85c1d311f8cace1bfadea", + "metadata": { + "author": [ + "Felipe Gasper (FELIPE)" + ] + }, + "name": "Call-Context-0.03-TRIAL1", + "date": "2018-10-25T03:47:31" + }, + "_index": "cpan_v1_01", + "_id": "Cjw1voci7z74uflSPriBTT_A_5c" + }, + { + "_id": "VdVDByg5PHxbDh9HnvKAzf8QOws", + "_index": "cpan_v1_01", + "_source": { + "download_url": "https://cpan.metacpan.org/authors/id/F/FE/FELIPE/Call-Context-0.01.tar.gz", + "stat": { + "size": 10019 + }, + "author": "FELIPE", + "distribution": "Call-Context", + "version": 0.01, + "date": "2016-11-12T23:12:54", + "checksum_sha256": "21bf762ef5b3cbf1047192c2a3c499e9bd315b11e5530bd133856cdf87187b24", + "name": "Call-Context-0.01", + "metadata": { + "author": [ + "Felipe Gasper (FELIPE)" + ] + } + }, + "_type": "release", + "_score": 14.460719 + }, + { + "_score": 14.314282, + "_id": "_MA6FD8SOhOmTG8JUhvl3CN186I", + "_type": "release", + "_source": { + "stat": { + "size": 10046 + }, + "download_url": "https://cpan.metacpan.org/authors/id/F/FE/FELIPE/Call-Context-0.02.tar.gz", + "distribution": "Call-Context", + "author": "FELIPE", + "version": 0.02, + "metadata": { + "author": [ + "Felipe Gasper (FELIPE)" + ] + }, + "checksum_sha256": "b80d977f1df0e08bda2808124cd7218ad83f802e1a54aa258e17748ff5c02a0a", + "name": "Call-Context-0.02", + "date": "2016-11-13T01:07:43" + }, + "_index": "cpan_v1_01" + }, + { + "_id": "veMmCu9wirwpTX7czbuQq6SnKQQ", + "_type": "release", + "_source": { + "stat": { + "size": 10741 + }, + "download_url": "https://cpan.metacpan.org/authors/id/F/FE/FELIPE/Call-Context-0.03-TRIAL2.tar.gz", + "distribution": "Call-Context", + "author": "FELIPE", + "version": "0.03-TRIAL2", + "name": "Call-Context-0.03-TRIAL2", + "metadata": { + "author": [ + "Felipe Gasper (FELIPE)" + ] + }, + "checksum_sha256": "4ca799d81fc96a774f4f315c38eb3e53616322c332d47f1e3f756814b5bf4b5e", + "date": "2018-10-26T13:56:41" + }, + "_index": "cpan_v1_01", + "_score": 14.291793 + }, + { + "_type": "release", + "_source": { + "version": "0.03", + "date": "2018-10-27T00:20:13", + "checksum_sha256": "0ee6bf46bc72755adb7a6b08e79d12e207de5f7809707b3c353b58cb2f0b5a26", + "metadata": { + "author": [ + "Felipe Gasper (FELIPE)" + ] + }, + "name": "Call-Context-0.03", + "download_url": "https://cpan.metacpan.org/authors/id/F/FE/FELIPE/Call-Context-0.03.tar.gz", + "stat": { + "size": 10730 + }, + "author": "FELIPE", + "distribution": "Call-Context" + }, + "_index": "cpan_v1_01", + "_id": "CAAVfGh_7XpKnzpnLVaBKg8IPMM", + "_score": 14.291793 + } + ], + "max_score": 14.460719 + }, + "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw==" +} \ No newline at end of file diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py index 9e7950c..46453c0 100644 --- a/swh/lister/cpan/tests/test_lister.py +++ b/swh/lister/cpan/tests/test_lister.py @@ -1,107 +1,171 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from itertools import chain import json from pathlib import Path import pytest -from swh.lister.cpan.lister import CpanLister +from swh.lister.cpan.lister import CpanLister, get_module_version @pytest.fixture def release_search_response(datadir): return json.loads( Path(datadir, "https_fastapi.metacpan.org", "v1_release__search").read_bytes() ) -@pytest.fixture -def release_scroll_first_response(datadir): +def release_scroll_response(datadir, page): return json.loads( - Path(datadir, "https_fastapi.metacpan.org", "v1__search_scroll").read_bytes() + Path( + datadir, "https_fastapi.metacpan.org", f"v1__search_scroll_page{page}" + ).read_bytes() ) +@pytest.fixture +def release_scroll_first_response(datadir): + return release_scroll_response(datadir, page=1) + + +@pytest.fixture +def release_scroll_second_response(datadir): + return release_scroll_response(datadir, page=2) + + +@pytest.fixture +def release_scroll_third_response(datadir): + return release_scroll_response(datadir, page=3) + + +@pytest.fixture +def release_scroll_fourth_response(datadir): + return release_scroll_response(datadir, page=4) + + @pytest.fixture(autouse=True) def mock_network_requests( - requests_mock, release_search_response, release_scroll_first_response + requests_mock, + release_search_response, + release_scroll_first_response, + release_scroll_second_response, + release_scroll_third_response, + release_scroll_fourth_response, ): requests_mock.get( "https://fastapi.metacpan.org/v1/release/_search", json=release_search_response, ) requests_mock.get( "https://fastapi.metacpan.org/v1/_search/scroll", [ { "json": release_scroll_first_response, }, + { + "json": release_scroll_second_response, + }, + { + "json": release_scroll_third_response, + }, + { + "json": release_scroll_fourth_response, + }, {"json": {"hits": {"hits": []}, "_scroll_id": ""}}, ], ) +@pytest.mark.parametrize( + "module_name,module_version,release_name,expected_version", + [ + ("Validator-Custom", "0.1207", "Validator-Custom-0.1207", "0.1207"), + ("UDPServersAndClients", 0, "UDPServersAndClients", "0"), + ("Compiler", 0, "Compiler-a1", "a1"), + ("Call-Context", 0.01, "Call-Context-0.01", "0.01"), + ], +) +def test_get_module_version( + module_name, module_version, release_name, expected_version +): + assert ( + get_module_version(module_name, module_version, release_name) + == expected_version + ) + + def test_cpan_lister( - swh_scheduler, release_search_response, release_scroll_first_response + swh_scheduler, + release_search_response, + release_scroll_first_response, + release_scroll_second_response, + release_scroll_third_response, + release_scroll_fourth_response, ): lister = CpanLister(scheduler=swh_scheduler) res = lister.run() expected_origins = set() expected_artifacts = defaultdict(list) expected_module_metadata = defaultdict(list) for release in chain( release_search_response["hits"]["hits"], release_scroll_first_response["hits"]["hits"], + release_scroll_second_response["hits"]["hits"], + release_scroll_third_response["hits"]["hits"], + release_scroll_fourth_response["hits"]["hits"], ): distribution = release["_source"]["distribution"] release_name = release["_source"]["name"] checksum_sha256 = release["_source"]["checksum_sha256"] download_url = release["_source"]["download_url"] version = release["_source"]["version"] size = release["_source"]["stat"]["size"] author = release["_source"]["author"] author_fullname = release["_source"]["metadata"]["author"][0] date = release["_source"]["date"] origin_url = f"https://metacpan.org/dist/{distribution}" + + version = get_module_version(distribution, version, release_name) + expected_origins.add(origin_url) expected_artifacts[origin_url].append( { "url": download_url, "filename": download_url.split("/")[-1], "version": version, "length": size, "checksums": {"sha256": checksum_sha256}, } ) expected_module_metadata[origin_url].append( { "name": distribution, "version": version, "cpan_author": author, "author": author_fullname if author_fullname != "unknown" else author, "date": date, "release_name": release_name, } ) assert res.pages == 1 assert res.origins == len(expected_origins) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(expected_origins) for origin in scheduler_origins: assert origin.visit_type == "cpan" assert origin.url in expected_origins assert origin.extra_loader_arguments == { "api_base_url": "https://fastapi.metacpan.org/v1", "artifacts": expected_artifacts[origin.url], "module_metadata": expected_module_metadata[origin.url], }