diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
--- a/swh/lister/cpan/__init__.py
+++ b/swh/lister/cpan/__init__.py
@@ -20,6 +20,10 @@
 a first `http api endpoint`_ that retrieve results and a ``_scroll_id`` that will
 be used to scroll pages through `search`_ endpoint.
 
+The lister is incremental: it stores the UTC date at which it was last executed as
+``lister.state.last_listing_date``. When present, that value is used to keep only
+results whose UTC `date`_ is greater than or equal to it.
+
 Page listing
 ------------
 
@@ -58,10 +62,10 @@
 .. _cpan.org: https://cpan.org/
 .. _metacpan.org: https://metacpan.org/
 .. _http api endpoint: https://explorer.metacpan.org/?url=/release/
-.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
-
+.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints
+.. _date: https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html#date-params
 
-"""
+"""  # noqa: B950
 
 
 def register():
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
--- a/swh/lister/cpan/lister.py
+++ b/swh/lister/cpan/lister.py
@@ -4,7 +4,9 @@
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
-from datetime import datetime
+from dataclasses import dataclass
+from datetime import datetime, timezone
+import json
 import logging
 from typing import Any, Dict, Iterator, List, Optional, Set, Union
 
@@ -13,14 +15,23 @@
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
+
 # Aliasing the page results returned by `get_pages` method from the lister.
 CpanListerPage = Set[str]
 
 
+@dataclass
+class CpanListerState:
+    """Store lister state for incremental mode operations"""
+
+    last_listing_date: Optional[datetime] = None
+    """UTC date of the last Cpan lister execution"""
+
+
 def get_field_value(entry, field_name):
     """
     Splits ``field_name`` on ``.``, and use it as path in the nested ``entry``
@@ -59,7 +70,7 @@
     return str(module_version)
 
 
-class CpanLister(StatelessLister[CpanListerPage]):
+class CpanLister(Lister[CpanListerState, CpanListerPage]):
     """The Cpan lister list origins from 'Cpan', the Comprehensive Perl Archive
     Network."""
 
@@ -93,6 +104,23 @@
         self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
         self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
         self.module_names: Set[str] = set()
+        # UTC (timezone-aware) date of this run, stored for incremental listing
+        self.listing_date = datetime.now(tz=timezone.utc)
+
+    def state_from_dict(self, d: Dict[str, Any]) -> CpanListerState:
+        """Rebuild the state from its dict form, parsing the stored ISO 8601 date"""
+        last_listing_date = d.get("last_listing_date")
+        if last_listing_date is not None:
+            d["last_listing_date"] = iso8601.parse_date(last_listing_date)
+        return CpanListerState(**d)
+
+    def state_to_dict(self, state: CpanListerState) -> Dict[str, Any]:
+        """Serialize the state to a dict, storing the date as an ISO 8601 string"""
+        d: Dict[str, Optional[str]] = {"last_listing_date": None}
+        last_listing_date = state.last_listing_date
+        if last_listing_date is not None:
+            d["last_listing_date"] = last_listing_date.isoformat()
+        return d
 
     def process_release_page(self, page: List[Dict[str, Any]]):
         for entry in page:
@@ -155,14 +183,28 @@
         endpoint = f"{self.API_BASE_URL}/release/_search"
         scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
         size = 1000
+        params = {
+            "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
+            "size": size,
+            "scroll": "1m",
+        }
+
+        if self.state.last_listing_date:
+            # Incremental mode
+            # Add a query entry to params that will act as a greater or equal date filter
+            params["source"] = json.dumps(
+                {
+                    "query": {
+                        "range": {
+                            "date": {"gte": self.state.last_listing_date.isoformat()}
+                        }
+                    }
+                }
+            )
 
         res = self.http_request(
             endpoint,
-            params={
-                "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
-                "size": size,
-                "scroll": "1m",
-            },
+            params=params,
         )
         data = res.json()["hits"]["hits"]
         self.process_release_page(data)
@@ -197,3 +239,8 @@
                     "module_metadata": self.module_metadata[module_name],
                 },
             )
+
+    def finalize(self) -> None:
+        """Save this run's listing date as the lower bound for the next listing"""
+        self.state.last_listing_date = self.listing_date
+        self.updated = True
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
@@ -243,4 +243,4 @@
   },
   "took": 14,
   "_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
-}
\ No newline at end of file
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental
@@ -0,0 +1,85 @@
+{
+   "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7MzsxOTk5Mzc4ODE6SUVISWZqMlBRTUdUbDU4c2l3R3ZBQTsxOTk5Mzc4ODI6SUVISWZqMlBRTUdUbDU4c2l3R3ZBQTsxMjk2ODI1MTYzOnl6bXZrM0FDVGptV21SY0Y0ZEZPVHc7MDs=",
+   "hits" : {
+      "total" : 3,
+      "hits" : [
+         {
+            "_index" : "cpan_v1_01",
+            "_id" : "OeC8IpwwhBktkazmdTcwIiKng4c",
+            "_score" : 1.0,
+            "_type" : "release",
+            "_source" : {
+               "checksum_sha256" : "0969535a1df17f570c16949d730eae2df6bb51d3140fa99416377e7b798ead62",
+               "author" : "PERLANCAR",
+               "name" : "Pod-Weaver-Plugin-Data-Sah-Filter-0.007",
+               "distribution" : "Pod-Weaver-Plugin-Data-Sah-Filter",
+               "date" : "2022-11-08T00:05:36",
+               "stat" : {
+                  "size" : 17164
+               },
+               "version" : "0.007",
+               "download_url" : "https://cpan.metacpan.org/authors/id/P/PE/PERLANCAR/Pod-Weaver-Plugin-Data-Sah-Filter-0.007.tar.gz",
+               "metadata" : {
+                  "author" : [
+                     "perlancar "
+                  ]
+               }
+            }
+         },
+         {
+            "_id" : "7AsfCaXwRnHlwkLbf90_Sxu2Ua8",
+            "_index" : "cpan_v1_01",
+            "_type" : "release",
+            "_score" : 1.0,
+            "_source" : {
+               "checksum_sha256" : "84a523b2e2d7459736442243f3d7c5833e08b267ab07eaa335bd25171429fc9c",
+               "author" : "SKIM",
+               "distribution" : "App-Kramerius-To-Images",
+               "name" : "App-Kramerius-To-Images-0.04",
+               "date" : "2022-11-08T20:24:20",
+               "stat" : {
+                  "size" : 29052
+               },
+               "version" : "0.04",
+               "download_url" : "https://cpan.metacpan.org/authors/id/S/SK/SKIM/App-Kramerius-To-Images-0.04.tar.gz",
+               "metadata" : {
+                  "author" : [
+                     "Michal Josef Spacek "
+                  ]
+               }
+            }
+         },
+         {
+            "_source" : {
+               "download_url" : "https://cpan.metacpan.org/authors/id/D/DE/DERIV/Math-Business-Lookback-0.01.tar.gz",
+               "metadata" : {
+                  "author" : [
+                     "binary.com "
+                  ]
+               },
+               "date" : "2022-11-09T06:37:09",
+               "name" : "Math-Business-Lookback-0.01",
+               "author" : "DERIV",
+               "distribution" : "Math-Business-Lookback",
+               "checksum_sha256" : "c8d4addf4b57033acb37980ac0652e8f80b325d17210927b10acb297c8840447",
+               "version" : "0.01",
+               "stat" : {
+                  "size" : 18852
+               }
+            },
+            "_type" : "release",
+            "_score" : 1.0,
+            "_index" : "cpan_v1_01",
+            "_id" : "Q_Ahjx4TqbFCqjpagPjPjCG9DjI"
+         }
+      ],
+      "max_score" : 1.0
+   },
+   "_shards" : {
+      "failed" : 0,
+      "successful" : 3,
+      "total" : 3
+   },
+   "timed_out" : false,
+   "took" : 12
+}
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
--- a/swh/lister/cpan/tests/test_lister.py
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -4,9 +4,11 @@
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
+from datetime import datetime, timezone
 from itertools import chain
 import json
 from pathlib import Path
+import re
 
 import pytest
 
@@ -48,6 +50,15 @@
     return release_scroll_response(datadir, page=4)
 
 
+@pytest.fixture
+def incremental_release_search_response(datadir):
+    return json.loads(
+        Path(
+            datadir, "https_fastapi.metacpan.org", "v1_release__search_incremental"
+        ).read_bytes()
+    )
+
+
 @pytest.fixture(autouse=True)
 def mock_network_requests(
     requests_mock,
@@ -56,6 +67,7 @@
     release_scroll_second_response,
     release_scroll_third_response,
     release_scroll_fourth_response,
+    incremental_release_search_response,
 ):
     requests_mock.get(
         "https://fastapi.metacpan.org/v1/release/_search",
@@ -80,6 +92,15 @@
         ],
     )
 
+    incremental_matcher = re.compile(
+        r"^https://fastapi.metacpan.org/v1/release/_search\?.*&source=\%7B%22query%22%3A.*"
+    )
+
+    requests_mock.get(
+        url=incremental_matcher,
+        json=incremental_release_search_response,
+    )
+
 
 @pytest.mark.parametrize(
     "module_name,module_version,release_name,expected_version",
@@ -169,3 +190,27 @@
             "artifacts": expected_artifacts[origin.url],
             "module_metadata": expected_module_metadata[origin.url],
         }
+
+
+def test_cpan_lister_incremental(
+    swh_scheduler,
+):
+    # First run
+    lister = CpanLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == 11
+    assert lister.state.last_listing_date
+    assert lister.state.last_listing_date.tzinfo is not None
+
+    # Second run
+    lister = CpanLister(scheduler=swh_scheduler)
+    # Force lister.state.last_listing_date for correct fixture loading
+    lister.state.last_listing_date = datetime(2022, 11, 8, 0, 0, tzinfo=timezone.utc)
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == 3
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    assert len(scheduler_origins) == 11 + 3