Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123768
D8824.id31816.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D8824.id31816.diff
View Options
diff --git a/swh/lister/cpan/__init__.py b/swh/lister/cpan/__init__.py
--- a/swh/lister/cpan/__init__.py
+++ b/swh/lister/cpan/__init__.py
@@ -20,6 +20,10 @@
a first `http api endpoint`_ that retrieve results and a ``_scroll_id`` that will
be used to scroll pages through `search`_ endpoint.
+The lister is incremental: it stores the UTC date at which it was last executed as
+``lister.state.last_listing_date``. When present, that value is used to keep only
+the results whose UTC `date`_ is greater than or equal to it.
+
Page listing
------------
@@ -58,10 +62,10 @@
.. _cpan.org: https://cpan.org/
.. _metacpan.org: https://metacpan.org/
.. _http api endpoint: https://explorer.metacpan.org/?url=/release/
-.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints # noqa: B950
-
+.. _search: https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md#search-without-constraints
+.. _date: https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html#date-params
-"""
+""" # noqa: B950
def register():
diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py
--- a/swh/lister/cpan/lister.py
+++ b/swh/lister/cpan/lister.py
@@ -4,7 +4,9 @@
# See top-level LICENSE file for more information
from collections import defaultdict
+from dataclasses import dataclass
from datetime import datetime
+import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Set, Union
@@ -13,14 +15,23 @@
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
+
# Aliasing the page results returned by `get_pages` method from the lister.
CpanListerPage = Set[str]
+@dataclass
+class CpanListerState:
+ """Store lister state for incremental mode operations"""
+
+ last_listing_date: Optional[datetime] = None
+ """Last date when Cpan lister was executed"""
+
+
def get_field_value(entry, field_name):
"""
Splits ``field_name`` on ``.``, and use it as path in the nested ``entry``
@@ -59,7 +70,7 @@
return str(module_version)
-class CpanLister(StatelessLister[CpanListerPage]):
+class CpanLister(Lister[CpanListerState, CpanListerPage]):
"""The Cpan lister list origins from 'Cpan', the Comprehensive Perl Archive
Network."""
@@ -93,6 +104,21 @@
self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
self.release_dates: Dict[str, List[datetime]] = defaultdict(list)
self.module_names: Set[str] = set()
+ # Store the datetime the lister runs for incremental purpose
+ self.listing_date = datetime.now()
+
+ def state_from_dict(self, d: Dict[str, Any]) -> CpanListerState:
+ last_listing_date = d.get("last_listing_date")
+ if last_listing_date is not None:
+ d["last_listing_date"] = iso8601.parse_date(last_listing_date)
+ return CpanListerState(**d)
+
+ def state_to_dict(self, state: CpanListerState) -> Dict[str, Any]:
+ d: Dict[str, Optional[str]] = {"last_listing_date": None}
+ last_listing_date = state.last_listing_date
+ if last_listing_date is not None:
+ d["last_listing_date"] = last_listing_date.isoformat()
+ return d
def process_release_page(self, page: List[Dict[str, Any]]):
for entry in page:
@@ -155,14 +181,28 @@
endpoint = f"{self.API_BASE_URL}/release/_search"
scrollendpoint = f"{self.API_BASE_URL}/_search/scroll"
size = 1000
+ params = {
+ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
+ "size": size,
+ "scroll": "1m",
+ }
+
+ if self.state.last_listing_date:
+ # Incremental mode
+ # Add a query entry to params that acts as a "greater than or equal" date filter
+ params["source"] = json.dumps(
+ {
+ "query": {
+ "range": {
+ "date": {"gte": self.state.last_listing_date.isoformat()}
+ }
+ }
+ }
+ )
res = self.http_request(
endpoint,
- params={
- "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS,
- "size": size,
- "scroll": "1m",
- },
+ params=params,
)
data = res.json()["hits"]["hits"]
self.process_release_page(data)
@@ -197,3 +237,7 @@
"module_metadata": self.module_metadata[module_name],
},
)
+
+ def finalize(self) -> None:
+ self.state.last_listing_date = self.listing_date
+ self.updated = True
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
--- a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search
@@ -243,4 +243,4 @@
},
"took": 14,
"_scroll_id": "cXVlcnlUaGVuRmV0Y2g7Mzs5OTQ2NzY3ODU6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTQ2NzY3ODQ6ZHZIZWxCb3BUZi1Cb3NwRDB5NmRQUTs5OTUyMzQzMTA6eXptdmszQUNUam1XbVJjRjRkRk9UdzswOw=="
-}
\ No newline at end of file
+}
diff --git a/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental
new file mode 100644
--- /dev/null
+++ b/swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search_incremental
@@ -0,0 +1,85 @@
+{
+ "_scroll_id" : "cXVlcnlUaGVuRmV0Y2g7MzsxOTk5Mzc4ODE6SUVISWZqMlBRTUdUbDU4c2l3R3ZBQTsxOTk5Mzc4ODI6SUVISWZqMlBRTUdUbDU4c2l3R3ZBQTsxMjk2ODI1MTYzOnl6bXZrM0FDVGptV21SY0Y0ZEZPVHc7MDs=",
+ "hits" : {
+ "total" : 3,
+ "hits" : [
+ {
+ "_index" : "cpan_v1_01",
+ "_id" : "OeC8IpwwhBktkazmdTcwIiKng4c",
+ "_score" : 1.0,
+ "_type" : "release",
+ "_source" : {
+ "checksum_sha256" : "0969535a1df17f570c16949d730eae2df6bb51d3140fa99416377e7b798ead62",
+ "author" : "PERLANCAR",
+ "name" : "Pod-Weaver-Plugin-Data-Sah-Filter-0.007",
+ "distribution" : "Pod-Weaver-Plugin-Data-Sah-Filter",
+ "date" : "2022-11-08T00:05:36",
+ "stat" : {
+ "size" : 17164
+ },
+ "version" : "0.007",
+ "download_url" : "https://cpan.metacpan.org/authors/id/P/PE/PERLANCAR/Pod-Weaver-Plugin-Data-Sah-Filter-0.007.tar.gz",
+ "metadata" : {
+ "author" : [
+ "perlancar <perlancar@cpan.org>"
+ ]
+ }
+ }
+ },
+ {
+ "_id" : "7AsfCaXwRnHlwkLbf90_Sxu2Ua8",
+ "_index" : "cpan_v1_01",
+ "_type" : "release",
+ "_score" : 1.0,
+ "_source" : {
+ "checksum_sha256" : "84a523b2e2d7459736442243f3d7c5833e08b267ab07eaa335bd25171429fc9c",
+ "author" : "SKIM",
+ "distribution" : "App-Kramerius-To-Images",
+ "name" : "App-Kramerius-To-Images-0.04",
+ "date" : "2022-11-08T20:24:20",
+ "stat" : {
+ "size" : 29052
+ },
+ "version" : "0.04",
+ "download_url" : "https://cpan.metacpan.org/authors/id/S/SK/SKIM/App-Kramerius-To-Images-0.04.tar.gz",
+ "metadata" : {
+ "author" : [
+ "Michal Josef Spacek <skim@cpan.org>"
+ ]
+ }
+ }
+ },
+ {
+ "_source" : {
+ "download_url" : "https://cpan.metacpan.org/authors/id/D/DE/DERIV/Math-Business-Lookback-0.01.tar.gz",
+ "metadata" : {
+ "author" : [
+ "binary.com <binary@cpan.org>"
+ ]
+ },
+ "date" : "2022-11-09T06:37:09",
+ "name" : "Math-Business-Lookback-0.01",
+ "author" : "DERIV",
+ "distribution" : "Math-Business-Lookback",
+ "checksum_sha256" : "c8d4addf4b57033acb37980ac0652e8f80b325d17210927b10acb297c8840447",
+ "version" : "0.01",
+ "stat" : {
+ "size" : 18852
+ }
+ },
+ "_type" : "release",
+ "_score" : 1.0,
+ "_index" : "cpan_v1_01",
+ "_id" : "Q_Ahjx4TqbFCqjpagPjPjCG9DjI"
+ }
+ ],
+ "max_score" : 1.0
+ },
+ "_shards" : {
+ "failed" : 0,
+ "successful" : 3,
+ "total" : 3
+ },
+ "timed_out" : false,
+ "took" : 12
+}
diff --git a/swh/lister/cpan/tests/test_lister.py b/swh/lister/cpan/tests/test_lister.py
--- a/swh/lister/cpan/tests/test_lister.py
+++ b/swh/lister/cpan/tests/test_lister.py
@@ -4,9 +4,11 @@
# See top-level LICENSE file for more information
from collections import defaultdict
+from datetime import datetime, timezone
from itertools import chain
import json
from pathlib import Path
+import re
import pytest
@@ -48,6 +50,15 @@
return release_scroll_response(datadir, page=4)
+@pytest.fixture
+def incremental_release_search_response(datadir):
+ return json.loads(
+ Path(
+ datadir, "https_fastapi.metacpan.org", "v1_release__search_incremental"
+ ).read_bytes()
+ )
+
+
@pytest.fixture(autouse=True)
def mock_network_requests(
requests_mock,
@@ -56,6 +67,7 @@
release_scroll_second_response,
release_scroll_third_response,
release_scroll_fourth_response,
+ incremental_release_search_response,
):
requests_mock.get(
"https://fastapi.metacpan.org/v1/release/_search",
@@ -80,6 +92,15 @@
],
)
+ incremental_matcher = re.compile(
+ r"^https://fastapi.metacpan.org/v1/release/_search\?.*&source=\%7B%22query%22%3A.*"
+ )
+
+ requests_mock.get(
+ url=incremental_matcher,
+ json=incremental_release_search_response,
+ )
+
@pytest.mark.parametrize(
"module_name,module_version,release_name,expected_version",
@@ -169,3 +190,26 @@
"artifacts": expected_artifacts[origin.url],
"module_metadata": expected_module_metadata[origin.url],
}
+
+
+def test_cpan_lister_incremental(
+ swh_scheduler,
+):
+ # First run
+ lister = CpanLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 11
+ assert lister.state.last_listing_date
+
+ # Second run
+ lister = CpanLister(scheduler=swh_scheduler)
+ # Force lister.state.last_listing_date for correct fixture loading
+ lister.state.last_listing_date = datetime(2022, 11, 8, 0, 0, tzinfo=timezone.utc)
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 3
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+ assert len(scheduler_origins) == 11 + 3
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 20 2024, 12:24 AM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217878
Attached To
D8824: Cpan: Implement incremental mode
Event Timeline
Log In to Comment