Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/hackage/lister.py
| # Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
| # See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| from dataclasses import dataclass | |||||
| from datetime import datetime | |||||
| import logging | import logging | ||||
| from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
| import iso8601 | import iso8601 | ||||
| from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
| from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
| from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, Lister | ||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type alias for one page of results, as yielded by the lister's `get_pages`
# method: a list of package entries, each one a dict.
HackageListerPage = List[Dict[str, Any]]
@dataclass
class HackageListerState:
    """Store lister state for incremental mode operations.

    ``index_last_update`` is the most recent ``last_update`` value seen among
    the origins listed by a previous run (the lister keeps the greatest value
    it meets, not the smallest). It is ``None`` until a run has recorded one.
    When set, the lister restricts its search to packages uploaded after that
    date.
    """

    # Greatest `last_update` recorded so far, or None when no run stored one.
    index_last_update: Optional[datetime] = None
| class HackageLister(Lister[HackageListerState, HackageListerPage]): | |||||
| """List Hackage (The Haskell Package Repository) origins.""" | """List Hackage (The Haskell Package Repository) origins.""" | ||||
| LISTER_NAME = "hackage" | LISTER_NAME = "hackage" | ||||
| VISIT_TYPE = "hackage" | VISIT_TYPE = "hackage" | ||||
| INSTANCE = "hackage" | INSTANCE = "hackage" | ||||
| BASE_URL = "https://hackage.haskell.org/" | BASE_URL = "https://hackage.haskell.org/" | ||||
| PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search" | PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search" | ||||
| Show All 9 Lines | ): | ||||
| scheduler=scheduler, | scheduler=scheduler, | ||||
| credentials=credentials, | credentials=credentials, | ||||
| instance=self.INSTANCE, | instance=self.INSTANCE, | ||||
| url=url if url else self.BASE_URL, | url=url if url else self.BASE_URL, | ||||
| ) | ) | ||||
| # Ensure to set this with same value as the http api search endpoint use | # Ensure to set this with same value as the http api search endpoint use | ||||
| # (50 as of august 2022) | # (50 as of august 2022) | ||||
| self.page_size: int = 50 | self.page_size: int = 50 | ||||
| # when iterating over origins, store the earliest last_update for incremental | |||||
| # purpose | |||||
| self.earliest_update: Optional[datetime] = None | |||||
def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
    """Build a ``HackageListerState`` from its dict form.

    Args:
        d: state fields keyed by name. When ``index_last_update`` is present
            and not ``None`` it is parsed with ``iso8601.parse_date``.

    Returns:
        The state object built from the converted fields.
    """
    # Work on a shallow copy so the caller's dict is left untouched.
    fields = dict(d)
    raw_last_update = fields.get("index_last_update")
    if raw_last_update is not None:
        fields["index_last_update"] = iso8601.parse_date(raw_last_update)
    return HackageListerState(**fields)
def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
    """Serialize the lister state to a plain dict.

    ``index_last_update`` is rendered with ``isoformat()``; it stays ``None``
    when the state holds no date.
    """
    last_update = state.index_last_update
    serialized: Optional[str] = (
        None if last_update is None else last_update.isoformat()
    )
    return {"index_last_update": serialized}
def get_pages(self) -> Iterator[HackageListerPage]:
    """Yield the successive result pages of the Hackage package search.

    Pages are fetched by POSTing to the http api endpoint
    `https://hackage.haskell.org/packages/search`; each page is the list of
    package entries from which origin urls are later built.

    When the lister state holds an `index_last_update`, the search query is
    extended so that only packages uploaded after that date are returned
    (incremental mode). Nothing is yielded when the first response carries no
    page contents.
    """
    search_url = self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)

    # Base query, plus the incremental filter when a previous run left a date.
    query_parts = ["(deprecated:any)"]
    since = self.state.index_last_update
    if since:
        query_parts.append("(lastUpload > %s)" % since.date().isoformat())

    payload = {
        "page": 0,
        "sortColumn": "default",
        "sortDirection": "ascending",
        "searchQuery": "".join(query_parts),
    }

    def fetch(page_number: int) -> Dict[str, Any]:
        # Same payload for every request; only the page index changes.
        payload["page"] = page_number
        response = self.http_request(url=search_url, method="POST", json=payload)
        return response.json()

    first = fetch(0)
    if not first.get("pageContents"):
        return

    # Ceiling division: a partially filled last page still counts as a page.
    total_pages = -(-first["numberOfResults"] // self.page_size)

    yield first["pageContents"]
    for page_number in range(1, total_pages):
        yield fetch(page_number)["pageContents"]
def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
    """Yield one ListedOrigin per package entry of the given page.

    As a side effect, `self.earliest_update` is refreshed while iterating:
    despite its name, it ends up holding the greatest `lastUpload` date met
    so far.
    """
    lister_id = self.lister_obj.id
    assert lister_id is not None

    for entry in page:
        package_name = entry["name"]["display"]
        uploaded_at = iso8601.parse_date(entry["lastUpload"])
        origin_url = self.PACKAGE_INFO_URL_PATTERN.format(
            base_url=self.url, pkgname=package_name
        )

        # Keep the greatest upload date seen so far.
        newest_seen = self.earliest_update
        if not newest_seen or newest_seen < uploaded_at:
            self.earliest_update = uploaded_at

        yield ListedOrigin(
            lister_id=lister_id,
            visit_type=self.VISIT_TYPE,
            url=origin_url,
            last_update=uploaded_at,
        )
def finalize(self) -> None:
    """Record the newest upload date seen during this run in the lister state.

    `self.earliest_update` holds (despite its name) the most recent
    `last_update` met while listing. The state is changed, and `self.updated`
    set to True, only when that value exists and is newer than the stored one
    (or when nothing was stored yet). A run that listed nothing leaves both
    the state and `self.updated` untouched.
    """
    # Most recent last_update value seen during this listing, if any.
    latest_seen = self.earliest_update
    if latest_seen is None:
        # Nothing was listed: there is no new information to persist.
        return

    stored = self.state.index_last_update
    if not stored or latest_seen > stored:
        self.state.index_last_update = latest_seen
        self.updated = True