Page MenuHomeSoftware Heritage

D8663.id31926.diff
No OneTemporary

D8663.id31926.diff

diff --git a/swh/lister/hackage/__init__.py b/swh/lister/hackage/__init__.py
--- a/swh/lister/hackage/__init__.py
+++ b/swh/lister/hackage/__init__.py
@@ -20,7 +20,7 @@
---------------------------
To get a list of all package names we make a POST call to
-`https://hackage.haskell.org/packages/search` endpoint with some params given as
+``https://hackage.haskell.org/packages/search`` endpoint with some params given as
json data.
Default params::
@@ -35,6 +35,10 @@
The page size is 50. The lister will make has much http api call has needed to get
all results.
+For incremental mode we extend the search query with ``lastUpload`` greater than or
+equal to the date of ``state.last_listing_date``, so that the api returns all package
+names that are new or updated since the last run.
+
Page listing
------------
@@ -60,7 +64,7 @@
-----------------
The lister yields 50 origins url per page.
-Each ListedOrigin has a `last_update` date set.
+Each ListedOrigin has a ``last_update`` date set.
Running tests
-------------
diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py
--- a/swh/lister/hackage/lister.py
+++ b/swh/lister/hackage/lister.py
@@ -3,6 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from dataclasses import dataclass
+from datetime import datetime, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional
@@ -11,7 +13,7 @@
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@@ -19,7 +21,15 @@
HackageListerPage = List[Dict[str, Any]]
-class HackageLister(StatelessLister[HackageListerPage]):
+@dataclass
+class HackageListerState:
+ """Store lister state for incremental mode operations"""
+
+ last_listing_date: Optional[datetime] = None
+ """Last date when Hackage lister was executed"""
+
+
+class HackageLister(Lister[HackageListerState, HackageListerPage]):
"""List Hackage (The Haskell Package Repository) origins."""
LISTER_NAME = "hackage"
@@ -45,6 +55,20 @@
# Ensure to set this with same value as the http api search endpoint use
# (50 as of august 2022)
self.page_size: int = 50
+ self.listing_date = datetime.now().astimezone(tz=timezone.utc)
+
+ def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
+ last_listing_date = d.get("last_listing_date")
+ if last_listing_date is not None:
+ d["last_listing_date"] = iso8601.parse_date(last_listing_date)
+ return HackageListerState(**d)
+
+ def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
+ d: Dict[str, Optional[str]] = {"last_listing_date": None}
+ last_listing_date = state.last_listing_date
+ if last_listing_date is not None:
+ d["last_listing_date"] = last_listing_date.isoformat()
+ return d
def get_pages(self) -> Iterator[HackageListerPage]:
"""Yield an iterator which returns 'page'
@@ -54,11 +78,24 @@
Results are paginated.
"""
+ # Search query
+ sq = "(deprecated:any)"
+
+ if self.state.last_listing_date:
+ last_str = (
+ self.state.last_listing_date.astimezone(tz=timezone.utc)
+ .date()
+ .isoformat()
+ )
+
+ # Incremental mode search query
+ sq += "(lastUpload >= %s)" % last_str
+
params = {
"page": 0,
"sortColumn": "default",
"sortDirection": "ascending",
- "searchQuery": "(deprecated:any)",
+ "searchQuery": sq,
}
data = self.http_request(
@@ -67,20 +104,22 @@
json=params,
).json()
- nb_entries: int = data["numberOfResults"]
- (nb_pages, remainder) = divmod(nb_entries, self.page_size)
- if remainder:
- nb_pages += 1
- yield data["pageContents"]
-
- for page in range(1, nb_pages):
- params["page"] = page
- data = self.http_request(
- url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
- method="POST",
- json=params,
- ).json()
+ if data.get("pageContents"):
+ nb_entries: int = data["numberOfResults"]
+ (nb_pages, remainder) = divmod(nb_entries, self.page_size)
+ if remainder:
+ nb_pages += 1
+ # First page
yield data["pageContents"]
+ # Next pages
+ for page in range(1, nb_pages):
+ params["page"] = page
+ data = self.http_request(
+ url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
+ method="POST",
+ json=params,
+ ).json()
+ yield data["pageContents"]
def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances."""
@@ -92,9 +131,14 @@
url = self.PACKAGE_INFO_URL_PATTERN.format(
base_url=self.url, pkgname=pkgname
)
+
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=last_update,
)
+
+ def finalize(self) -> None:
+ self.state.last_listing_date = self.listing_date
+ self.updated = True
diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1
@@ -0,0 +1 @@
+{"numberOfResults":3,"pageContents":[{"description":"Translations of classic Truth Maintenance Systems","downloads":14,"lastUpload":"2022-09-13T19:21:15.533437837Z","maintainers":[{"display":"jpmrst","uri":"/user/jpmrst"}],"name":{"display":"BPS","uri":"/package/BPS"},"tags":[{"display":"gpl","uri":"/packages/tag/gpl"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"truth-maintenance","uri":"/packages/tag/truth-maintenance"}],"votes":0},{"description":"C-Structs implementation for Haskell","downloads":25,"lastUpload":"2022-09-30T08:00:34.348551203Z","maintainers":[{"display":"SimonPlakolb","uri":"/user/SimonPlakolb"}],"name":{"display":"C-structs","uri":"/package/C-structs"},"tags":[{"display":"c","uri":"/packages/tag/c"},{"display":"data","uri":"/packages/tag/data"},{"display":"foreign","uri":"/packages/tag/foreign"},{"display":"library","uri":"/packages/tag/library"},{"display":"mit","uri":"/packages/tag/mit"},{"display":"structures","uri":"/packages/tag/structures"}],"votes":2},{"description":"Cluster algorithms, PCA, and chemical conformere analysis","downloads":29,"lastUpload":"2022-09-28T11:54:25.8011197Z","maintainers":[{"display":"phillipseeber","uri":"/user/phillipseeber"}],"name":{"display":"ConClusion","uri":"/package/ConClusion"},"tags":[{"display":"agpl","uri":"/packages/tag/agpl"},{"display":"chemistry","uri":"/packages/tag/chemistry"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"statistics","uri":"/packages/tag/statistics"}],"votes":2}]}
diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2
new file mode 100644
--- /dev/null
+++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2
@@ -0,0 +1 @@
+{"numberOfResults":0,"pageContents":[]}
diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py
--- a/swh/lister/hackage/tests/test_lister.py
+++ b/swh/lister/hackage/tests/test_lister.py
@@ -8,25 +8,31 @@
from pathlib import Path
from urllib.parse import unquote, urlparse
-from swh.lister.hackage.lister import HackageLister
+import iso8601
+from swh.lister.hackage.lister import HackageLister, HackageListerState
-def json_callback(request, context, datadir):
- """Callback for requests_mock that load a json file regarding a page number"""
- page = request.json()["page"]
+def json_callback(request, context, datadir, visit=0):
+ """Callback for requests_mock that loads the json file matching a page number"""
unquoted_url = unquote(request.url)
url = urlparse(unquoted_url)
+ page = request.json()["page"]
+
dirname = "%s_%s" % (url.scheme, url.hostname)
filename = url.path[1:]
if filename.endswith("/"):
filename = filename[:-1]
filename = filename.replace("/", "_")
+ filepath = Path(datadir, dirname, f"{filename}_{page}")
- return json.loads(Path(datadir, dirname, f"{filename}_{page}").read_text())
+ if visit > 0:
+ filepath = filepath.parent / f"{filepath.stem}_visit{visit}"
+ return json.loads(filepath.read_text())
def test_hackage_lister(swh_scheduler, requests_mock, datadir):
+ """Assert a full listing of 3 pages of 50 origins"""
requests_mock.post(
url="https://hackage.haskell.org/packages/search",
@@ -74,6 +80,10 @@
def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
+ """Test Pagination
+
+ Page size is 50, lister returns 1 page when origins < page size
+ """
requests_mock.post(
url="https://fake49.haskell.org/packages/search",
status_code=200,
@@ -87,6 +97,10 @@
def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
+ """Test Pagination
+
+ Page size is 50, lister returns 2 pages when origins > page size
+ """
requests_mock.post(
url="https://fake51.haskell.org/packages/search",
status_code=200,
@@ -98,3 +112,86 @@
assert len(pages) == 2
assert len(pages[0]) == 50
assert len(pages[1]) == 1
+
+
+def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
+ """Test incremental lister
+
+ * First run, full listing, 3 pages, 150 origins
+ * Second run, 1 page, 3 new or updated origins
+ * Third run, nothing new, 0 pages, 0 origins
+ """
+
+ mock_url = "https://hackage.haskell.org/packages/search"
+
+ # first run
+ requests_mock.post(
+ url=mock_url,
+ status_code=200,
+ json=functools.partial(json_callback, datadir=datadir),
+ )
+ lister = HackageLister(scheduler=swh_scheduler)
+ # force lister.state.last_listing_date to a value other than 'now'
+ lister.state.last_listing_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z")
+ lister.set_state_in_scheduler()
+ assert lister.get_state_from_scheduler() == HackageListerState(
+ last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z")
+ )
+
+ first = lister.run()
+ assert first.pages == 3
+ assert first.origins == 3 * 50
+ # 3 http requests done
+ assert len(requests_mock.request_history) == 3
+ for rh in requests_mock.request_history:
+ assert rh.json()["searchQuery"] == "(deprecated:any)(lastUpload >= 2022-08-26)"
+
+ # second run
+ requests_mock.post(
+ url=mock_url,
+ status_code=200,
+ json=functools.partial(json_callback, datadir=datadir, visit=1),
+ )
+ lister = HackageLister(scheduler=swh_scheduler)
+ # force lister.state.last_listing_date to a value other than 'now'
+ lister.state.last_listing_date = iso8601.parse_date(
+ "2022-09-30T08:00:34.348551203Z"
+ )
+ lister.set_state_in_scheduler()
+ assert lister.get_state_from_scheduler() == HackageListerState(
+ last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
+ )
+
+ second = lister.run()
+ assert second.pages == 1
+ assert second.origins == 3
+
+ assert len(requests_mock.request_history) == 3 + 1
+ # Check the first three ones, should be the same as first run
+ for i in range(3):
+ assert (
+ requests_mock.request_history[i].json()["searchQuery"]
+ == "(deprecated:any)(lastUpload >= 2022-08-26)"
+ )
+ # Check the last one, lastUpload should be the same as second run
+ assert (
+ requests_mock.last_request.json()["searchQuery"]
+ == "(deprecated:any)(lastUpload >= 2022-09-30)"
+ )
+
+ # third run (no update since last run: no new or updated origins, only one http
+ # request that returns no results)
+ requests_mock.post(
+ url=mock_url,
+ status_code=200,
+ json=functools.partial(json_callback, datadir=datadir, visit=2),
+ )
+ lister = HackageLister(scheduler=swh_scheduler)
+ third = lister.run()
+
+ assert third.pages == 0
+ assert third.origins == 0
+ assert lister.get_state_from_scheduler() == HackageListerState(
+ last_listing_date=lister.state.last_listing_date
+ )
+ assert len(requests_mock.request_history) == 3 + 1 + 1

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 9:59 AM (15 h, 49 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3215771

Event Timeline