Page Menu | Home | Software Heritage

D8824.id31816.diff
No One | Temporary

D8824.id31816.diff

diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
--- a/swh/lister/cpan/__init__.py
+++ b/swh/lister/cpan/__init__.py
@@ -20,6 +20,10 @@
a first `http api endpoint`_ that retrieve results and a ``_scroll_id`` that will
be used to scroll pages through `search`_ endpoint.
+The lister is incremental: it stores the UTC date at which it was executed as
+``lister.state.last_listing_date``. When present, that value is used to filter results
+whose UTC `date`_ is greater than or equal to it.
+
Page listing
------------
@@ -58,10 +62,10 @@
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
-.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
-
+.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints
+.. _date: https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html#date-params
-"""
+""" # noqa: B950
def register():
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
--- a/swh/lister/cpan/lister.py
+++ b/swh/lister/cpan/lister.py
@@ -4,7 +4,9 @@
# See top-level LICENSE file for more information
from collections import defaultdict
+from dataclasses import dataclass
from datetime import datetime
+import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Set, Union
@@ -13,14 +15,23 @@
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
+
# Aliasing the page results returned by `get_pages` method from the lister.
CpanListerPage = Set[str]
+@dataclass
+class CpanListerState:
+ """Store lister state for incremental mode operations"""
+
+ last_listing_date: Optional[datetime] = None
+ """Last date when Cpan lister was executed"""
+
+
def get_field_value(entry, field_name):
"""
Splits ``field_name`` on ``.``, and use it as path in the nested ``entry``
@@ -59,7 +70,7 @@
return str(module_version)
-class CpanLister(StatelessLister[CpanListerPage]):
+class CpanLister(Lister[CpanListerState, CpanListerPage]):
"""The Cpan lister list origins from 'Cpan', the Comprehensive Perl Archive
Network."""
@@ -93,6 +104,21 @@
self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
self.module_names: Set[str] = set()
+ # Store the datetime at which the lister runs, for incremental listing
+ self.listing_date = datetime.now()
+
+ def state_from_dict(self, d: Dict[str, Any]) -> CpanListerState:
+ last_listing_date = d.get("last_listing_date")
+ if last_listing_date is not None:
+ d["last_listing_date"] = iso8601.parse_date(last_listing_date)
+ return CpanListerState(**d)
+
+ def state_to_dict(self, state: CpanListerState) -> Dict[str, Any]:
+ d: Dict[str, Optional[str]] = {"last_listing_date": None}
+ last_listing_date = state.last_listing_date
+ if last_listing_date is not None:
+ d["last_listing_date"] = last_listing_date.isoformat()
+ return d
def process_release_page(self, page: List[Dict[str, Any]]):
for entry in page:
@@ -155,14 +181,28 @@
endpoint = f"{self.API_BASE_URL}/release/_search"
scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
size = 1000
+ params = {
+ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
+ "size": size,
+ "scroll": "1m",
+ }
+
+ if self.state.last_listing_date:
+ # Incremental mode
+ # Add a query to params that acts as a greater-than-or-equal date filter
+ params["source"] = json.dumps(
+ {
+ "query": {
+ "range": {
+ "date": {"gte": self.state.last_listing_date.isoformat()}
+ }
+ }
+ }
+ )
res = self.http_request(
endpoint,
- params={
- "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
- "size": size,
- "scroll": "1m",
- },
+ params=params,
)
data = res.json()["hits"]["hits"]
self.process_release_page(data)
@@ -197,3 +237,7 @@
"module_metadata": self.module_metadata[module_name],
},
)
+
+ def finalize(self) -> None:
+ self.state.last_listing_date = self.listing_date
+ self.updated = True
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
@@ -243,4 +243,4 @@
},
"took": 14,
"_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
-}
\ No newline at end of file
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental
@@ -0,0 +1,85 @@
+{
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7MzsxOTk5Mzc4ODE6SUVISWZqMlBRTUdUbDU4c2l3R3ZBQTsxOTk5Mzc4ODI6SUVISWZqMlBRTUdUbDU4c2l3R3ZBQTsxMjk2ODI1MTYzOnl6bXZrM0FDVGptV21SY0Y0ZEZPVHc7MDs=",
+ "hits" : {
+ "total" : 3,
+ "hits" : [
+ {
+ "_index" : "cpan_v1_01",
+ "_id" : "OeC8IpwwhBktkazmdTcwIiKng4c",
+ "_score" : 1.0,
+ "_type" : "release",
+ "_source" : {
+ "checksum_sha256" : "0969535a1df17f570c16949d730eae2df6bb51d3140fa99416377e7b798ead62",
+ "author" : "PERLANCAR",
+ "name" : "Pod-Weaver-Plugin-Data-Sah-Filter-0.007",
+ "distribution" : "Pod-Weaver-Plugin-Data-Sah-Filter",
+ "date" : "2022-11-08T00:05:36",
+ "stat" : {
+ "size" : 17164
+ },
+ "version" : "0.007",
+ "download_url" : "https://cpan.metacpan.org/authors/id/P/PE/PERLANCAR/Pod-Weaver-Plugin-Data-Sah-Filter-0.007.tar.gz",
+ "metadata" : {
+ "author" : [
+ "perlancar <perlancar@cpan.org>"
+ ]
+ }
+ }
+ },
+ {
+ "_id" : "7AsfCaXwRnHlwkLbf90_Sxu2Ua8",
+ "_index" : "cpan_v1_01",
+ "_type" : "release",
+ "_score" : 1.0,
+ "_source" : {
+ "checksum_sha256" : "84a523b2e2d7459736442243f3d7c5833e08b267ab07eaa335bd25171429fc9c",
+ "author" : "SKIM",
+ "distribution" : "App-Kramerius-To-Images",
+ "name" : "App-Kramerius-To-Images-0.04",
+ "date" : "2022-11-08T20:24:20",
+ "stat" : {
+ "size" : 29052
+ },
+ "version" : "0.04",
+ "download_url" : "https://cpan.metacpan.org/authors/id/S/SK/SKIM/App-Kramerius-To-Images-0.04.tar.gz",
+ "metadata" : {
+ "author" : [
+ "Michal Josef Spacek <skim@cpan.org>"
+ ]
+ }
+ }
+ },
+ {
+ "_source" : {
+ "download_url" : "https://cpan.metacpan.org/authors/id/D/DE/DERIV/Math-Business-Lookback-0.01.tar.gz",
+ "metadata" : {
+ "author" : [
+ "binary.com <binary@cpan.org>"
+ ]
+ },
+ "date" : "2022-11-09T06:37:09",
+ "name" : "Math-Business-Lookback-0.01",
+ "author" : "DERIV",
+ "distribution" : "Math-Business-Lookback",
+ "checksum_sha256" : "c8d4addf4b57033acb37980ac0652e8f80b325d17210927b10acb297c8840447",
+ "version" : "0.01",
+ "stat" : {
+ "size" : 18852
+ }
+ },
+ "_type" : "release",
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "Q_Ahjx4TqbFCqjpagPjPjCG9DjI"
+ }
+ ],
+ "max_score" : 1.0
+ },
+ "_shards" : {
+ "failed" : 0,
+ "successful" : 3,
+ "total" : 3
+ },
+ "timed_out" : false,
+ "took" : 12
+}
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
--- a/swh/lister/cpan/tests/test_lister.py
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -4,9 +4,11 @@
# See top-level LICENSE file for more information
from collections import defaultdict
+from datetime import datetime, timezone
from itertools import chain
import json
from pathlib import Path
+import re
import pytest
@@ -48,6 +50,15 @@
return release_scroll_response(datadir, page=4)
+@pytest.fixture
+def incremental_release_search_response(datadir):
+ return json.loads(
+ Path(
+ datadir, "https_fastapi.metacpan.org", "v1_release__search_incremental"
+ ).read_bytes()
+ )
+
+
@pytest.fixture(autouse=True)
def mock_network_requests(
requests_mock,
@@ -56,6 +67,7 @@
release_scroll_second_response,
release_scroll_third_response,
release_scroll_fourth_response,
+ incremental_release_search_response,
):
requests_mock.get(
"https://fastapi.metacpan.org/v1/release/_search",
@@ -80,6 +92,15 @@
],
)
+ incremental_matcher = re.compile(
+ r"^https://fastapi.metacpan.org/v1/release/_search\?.*&source=\%7B%22query%22%3A.*"
+ )
+
+ requests_mock.get(
+ url=incremental_matcher,
+ json=incremental_release_search_response,
+ )
+
@pytest.mark.parametrize(
"module_name,module_version,release_name,expected_version",
@@ -169,3 +190,26 @@
"artifacts": expected_artifacts[origin.url],
"module_metadata": expected_module_metadata[origin.url],
}
+
+
+def test_cpan_lister_incremental(
+ swh_scheduler,
+):
+ # First run
+ lister = CpanLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 11
+ assert lister.state.last_listing_date
+
+ # Second run
+ lister = CpanLister(scheduler=swh_scheduler)
+ # Force lister.state.last_listing_date for correct fixture loading
+ lister.state.last_listing_date = datetime(2022, 11, 8, 0, 0, tzinfo=timezone.utc)
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 3
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+ assert len(scheduler_origins) == 11 + 3

File Metadata

Mime Type
text/plain
Expires
Dec 20 2024, 12:24 AM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217878

Event Timeline