diff --git a/swh/lister/hackage/__init__.py b/swh/lister/hackage/__init__.py
--- a/swh/lister/hackage/__init__.py
+++ b/swh/lister/hackage/__init__.py
@@ -20,7 +20,7 @@
 ---------------------------
 
 To get a list of all package names we make a POST call to
-`https://hackage.haskell.org/packages/search` endpoint with some params given as
+``https://hackage.haskell.org/packages/search`` endpoint with some params given as
 json data.
 
 Default params::
@@ -35,6 +35,10 @@
 The page size is 50. The lister will make as many HTTP API calls as needed to
 get all results.
 
+For incremental mode we expand the search query with a ``lastUpload`` bound greater
+than or equal to ``state.last_listing_date``; the API then returns all package names
+new or updated since the last run.
+
 Page listing
 ------------
 
@@ -60,7 +64,7 @@
 -----------------
 
 The lister yields 50 origin URLs per page.
-Each ListedOrigin has a `last_update` date set.
+Each ListedOrigin has a ``last_update`` date set.
 
 Running tests
 -------------
diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py
--- a/swh/lister/hackage/lister.py
+++ b/swh/lister/hackage/lister.py
@@ -3,6 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from dataclasses import dataclass
+from datetime import datetime, timezone
 import logging
 from typing import Any, Dict, Iterator, List, Optional
 
@@ -11,7 +13,7 @@
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
@@ -19,7 +21,15 @@
 HackageListerPage = List[Dict[str, Any]]
 
 
-class HackageLister(StatelessLister[HackageListerPage]):
+@dataclass
+class HackageListerState:
+    """Store lister state for incremental mode operations"""
+
+    last_listing_date: Optional[datetime] = None
+    """Last date when the Hackage lister was executed"""
+
+
+class HackageLister(Lister[HackageListerState, HackageListerPage]):
     """List Hackage (The Haskell Package Repository) origins."""
 
     LISTER_NAME = "hackage"
@@ -45,6 +55,20 @@
         # Ensure to set this to the same value as the http api search endpoint
         # uses (50 as of August 2022)
         self.page_size: int = 50
+        self.listing_date = datetime.now().astimezone(tz=timezone.utc)
+
+    def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
+        last_listing_date = d.get("last_listing_date")
+        if last_listing_date is not None:
+            d["last_listing_date"] = iso8601.parse_date(last_listing_date)
+        return HackageListerState(**d)
+
+    def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
+        d: Dict[str, Optional[str]] = {"last_listing_date": None}
+        last_listing_date = state.last_listing_date
+        if last_listing_date is not None:
+            d["last_listing_date"] = last_listing_date.isoformat()
+        return d
 
     def get_pages(self) -> Iterator[HackageListerPage]:
         """Yield an iterator which returns 'page'
@@ -54,11 +78,24 @@
         Results are paginated.
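+
+        An illustrative incremental-mode request body, where the date is
+        derived from ``state.last_listing_date`` (values here are examples,
+        not fixed defaults)::
+
+            {
+                "page": 0,
+                "sortColumn": "default",
+                "sortDirection": "ascending",
+                "searchQuery": "(deprecated:any)(lastUpload >= 2022-08-26)"
+            }
+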
""" + # Search query + sq = "(deprecated:any)" + + if self.state.last_listing_date: + last_str = ( + self.state.last_listing_date.astimezone(tz=timezone.utc) + .date() + .isoformat() + ) + + # Incremental mode search query + sq += "(lastUpload >= %s)" % last_str + params = { "page": 0, "sortColumn": "default", "sortDirection": "ascending", - "searchQuery": "(deprecated:any)", + "searchQuery": sq, } data = self.http_request( @@ -67,20 +104,22 @@ json=params, ).json() - nb_entries: int = data["numberOfResults"] - (nb_pages, remainder) = divmod(nb_entries, self.page_size) - if remainder: - nb_pages += 1 - yield data["pageContents"] - - for page in range(1, nb_pages): - params["page"] = page - data = self.http_request( - url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), - method="POST", - json=params, - ).json() + if data.get("pageContents"): + nb_entries: int = data["numberOfResults"] + (nb_pages, remainder) = divmod(nb_entries, self.page_size) + if remainder: + nb_pages += 1 + # First page yield data["pageContents"] + # Next pages + for page in range(1, nb_pages): + params["page"] = page + data = self.http_request( + url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), + method="POST", + json=params, + ).json() + yield data["pageContents"] def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" @@ -92,9 +131,14 @@ url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) + yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, ) + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 new file mode 100644 --- /dev/null +++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 @@ -0,0 +1 @@ +{"numberOfResults":3,"pageContents":[{"description":"Translations of classic Truth Maintenance Systems","downloads":14,"lastUpload":"2022-09-13T19:21:15.533437837Z","maintainers":[{"display":"jpmrst","uri":"/user/jpmrst"}],"name":{"display":"BPS","uri":"/package/BPS"},"tags":[{"display":"gpl","uri":"/packages/tag/gpl"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"truth-maintenance","uri":"/packages/tag/truth-maintenance"}],"votes":0},{"description":"C-Structs implementation for Haskell","downloads":25,"lastUpload":"2022-09-30T08:00:34.348551203Z","maintainers":[{"display":"SimonPlakolb","uri":"/user/SimonPlakolb"}],"name":{"display":"C-structs","uri":"/package/C-structs"},"tags":[{"display":"c","uri":"/packages/tag/c"},{"display":"data","uri":"/packages/tag/data"},{"display":"foreign","uri":"/packages/tag/foreign"},{"display":"library","uri":"/packages/tag/library"},{"display":"mit","uri":"/packages/tag/mit"},{"display":"structures","uri":"/packages/tag/structures"}],"votes":2},{"description":"Cluster algorithms, PCA, and chemical conformere 
analysis","downloads":29,"lastUpload":"2022-09-28T11:54:25.8011197Z","maintainers":[{"display":"phillipseeber","uri":"/user/phillipseeber"}],"name":{"display":"ConClusion","uri":"/package/ConClusion"},"tags":[{"display":"agpl","uri":"/packages/tag/agpl"},{"display":"chemistry","uri":"/packages/tag/chemistry"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"statistics","uri":"/packages/tag/statistics"}],"votes":2}]} diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 new file mode 100644 --- /dev/null +++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 @@ -0,0 +1 @@ +{"numberOfResults":0,"pageContents":[]} diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py --- a/swh/lister/hackage/tests/test_lister.py +++ b/swh/lister/hackage/tests/test_lister.py @@ -8,25 +8,31 @@ from pathlib import Path from urllib.parse import unquote, urlparse -from swh.lister.hackage.lister import HackageLister +import iso8601 +from swh.lister.hackage.lister import HackageLister, HackageListerState -def json_callback(request, context, datadir): - """Callback for requests_mock that load a json file regarding a page number""" - page = request.json()["page"] +def json_callback(request, context, datadir, visit=0): + """Callback for requests_mock that load a json file regarding a page number""" unquoted_url = unquote(request.url) url = urlparse(unquoted_url) + page = request.json()["page"] + dirname = "%s_%s" % (url.scheme, url.hostname) filename = url.path[1:] if filename.endswith("/"): filename = filename[:-1] filename = filename.replace("/", "_") + filepath = Path(datadir, dirname, f"{filename}_{page}") - return json.loads(Path(datadir, dirname, f"{filename}_{page}").read_text()) + if visit > 0: + filepath = filepath.parent / f"{filepath.stem}_visit{visit}" + return json.loads(filepath.read_text()) def test_hackage_lister(swh_scheduler, requests_mock, datadir): + """Assert a full listing of 3 pages of 50 origins""" requests_mock.post( url="https://hackage.haskell.org/packages/search", @@ -74,6 +80,10 @@ def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir): + """Test Pagination + + Page size is 50, lister returns 1 page when origins < page size + """ requests_mock.post( url="https://fake49.haskell.org/packages/search", status_code=200, @@ -87,6 +97,10 @@ def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir): + """Test Pagination + + Page size is 50, lister returns 2 page when origins > page size + """ requests_mock.post( url="https://fake51.haskell.org/packages/search", status_code=200, @@ -98,3 +112,86 @@ assert len(pages) == 2 assert len(pages[0]) == 50 assert len(pages[1]) == 1 + + +def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir): + """Test incremental lister + + * First run, full listing, 3 pages, 150 origins + * Second run, 1 page, 3 new or updated origins + * Third run, nothing new, 0 page, 0 origins + """ + + mock_url = "https://hackage.haskell.org/packages/search" + + # first run + requests_mock.post( + url=mock_url, + status_code=200, + json=functools.partial(json_callback, datadir=datadir), + ) + lister = HackageLister(scheduler=swh_scheduler) + # force lister.last_listing_date to not being 'now' + lister.state.last_listing_date = 
+    lister.set_state_in_scheduler()
+    assert lister.get_state_from_scheduler() == HackageListerState(
+        last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z")
+    )
+
+    first = lister.run()
+    assert first.pages == 3
+    assert first.origins == 3 * 50
+    # 3 http requests done
+    assert len(requests_mock.request_history) == 3
+    for rh in requests_mock.request_history:
+        assert rh.json()["searchQuery"] == "(deprecated:any)(lastUpload >= 2022-08-26)"
+
+    # second run
+    requests_mock.post(
+        url=mock_url,
+        status_code=200,
+        json=functools.partial(json_callback, datadir=datadir, visit=1),
+    )
+    lister = HackageLister(scheduler=swh_scheduler)
+    # force lister.state.last_listing_date so that it is not 'now'
+    lister.state.last_listing_date = iso8601.parse_date(
+        "2022-09-30T08:00:34.348551203Z"
+    )
+    lister.set_state_in_scheduler()
+    assert lister.get_state_from_scheduler() == HackageListerState(
+        last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
+    )
+
+    second = lister.run()
+    assert second.pages == 1
+    assert second.origins == 3
+
+    assert len(requests_mock.request_history) == 3 + 1
+    # Check the first three requests; they should be the same as in the first run
+    for i in range(3):
+        assert (
+            requests_mock.request_history[i].json()["searchQuery"]
+            == "(deprecated:any)(lastUpload >= 2022-08-26)"
+        )
+    # Check the last one; its lastUpload bound comes from the second run's state
+    assert (
+        requests_mock.last_request.json()["searchQuery"]
+        == "(deprecated:any)(lastUpload >= 2022-09-30)"
+    )
+
+    # third run (no update since last run: no new or updated origins, but one
+    # http request returning no results)
+    requests_mock.post(
+        url=mock_url,
+        status_code=200,
+        json=functools.partial(json_callback, datadir=datadir, visit=2),
+    )
+    lister = HackageLister(scheduler=swh_scheduler)
+    third = lister.run()
+
+    assert third.pages == 0
+    assert third.origins == 0
+    assert lister.get_state_from_scheduler() == HackageListerState(
+        last_listing_date=lister.state.last_listing_date
+    )
+    assert len(requests_mock.request_history) == 3 + 1 + 1
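
Reviewer note: a minimal sketch (not part of the patch) of the incremental flow
implemented above, assuming a configured scheduler instance such as the
``swh_scheduler`` fixture used in the tests::

    from swh.lister.hackage.lister import HackageLister

    # First run: no stored state, so get_pages sends the default
    # searchQuery "(deprecated:any)" and walks every result page.
    lister = HackageLister(scheduler=swh_scheduler)
    lister.run()  # finalize() records listing_date as state.last_listing_date

    # Subsequent run: a fresh instance loads the stored state and sends
    # "(deprecated:any)(lastUpload >= <date of previous run>)", so only
    # new or updated package names are returned by the search endpoint.
    lister = HackageLister(scheduler=swh_scheduler)
    lister.run()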