Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/hackage/lister.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | |||||
from datetime import datetime, timezone | |||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
import iso8601 | import iso8601 | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
# Aliasing the page results returned by `get_pages` method from the lister. | # Aliasing the page results returned by `get_pages` method from the lister. | ||||
HackageListerPage = List[Dict[str, Any]] | HackageListerPage = List[Dict[str, Any]] | ||||
@dataclass
class HackageListerState:
    """State kept by the Hackage lister to support incremental listing."""

    #: Date of the last Hackage lister execution (``None`` if not recorded).
    last_listing_date: Optional[datetime] = None
class HackageLister(Lister[HackageListerState, HackageListerPage]): | |||||
"""List Hackage (The Haskell Package Repository) origins.""" | """List Hackage (The Haskell Package Repository) origins.""" | ||||
LISTER_NAME = "hackage" | LISTER_NAME = "hackage" | ||||
VISIT_TYPE = "hackage" | VISIT_TYPE = "hackage" | ||||
INSTANCE = "hackage" | INSTANCE = "hackage" | ||||
BASE_URL = "https://hackage.haskell.org/" | BASE_URL = "https://hackage.haskell.org/" | ||||
PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search" | PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search" | ||||
Show All 9 Lines | ): | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
credentials=credentials, | credentials=credentials, | ||||
instance=self.INSTANCE, | instance=self.INSTANCE, | ||||
url=url if url else self.BASE_URL, | url=url if url else self.BASE_URL, | ||||
) | ) | ||||
# Ensure to set this with same value as the http api search endpoint use | # Ensure to set this with same value as the http api search endpoint use | ||||
# (50 as of august 2022) | # (50 as of august 2022) | ||||
self.page_size: int = 50 | self.page_size: int = 50 | ||||
self.listing_date = datetime.now().astimezone(tz=timezone.utc) | |||||
def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
    """Build a ``HackageListerState`` from its serialized dict form.

    Args:
        d: serialized state; a non-``None`` ``last_listing_date`` value is
            parsed with ``iso8601.parse_date``.

    Returns:
        The state instance built from the entries of ``d``.
    """
    # Work on a shallow copy so the caller's dict is not modified in place.
    state_kwargs = dict(d)
    raw_date = state_kwargs.get("last_listing_date")
    if raw_date is not None:
        state_kwargs["last_listing_date"] = iso8601.parse_date(raw_date)
    return HackageListerState(**state_kwargs)
def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
    """Serialize the lister state to a plain dict.

    ``last_listing_date`` is stored as the ``isoformat()`` string of the
    date, or ``None`` when the state holds no date.
    """
    listing_date = state.last_listing_date
    serialized: Optional[str] = (
        None if listing_date is None else listing_date.isoformat()
    )
    return {"last_listing_date": serialized}
def get_pages(self) -> Iterator[HackageListerPage]:
    """Yield the result pages of the Hackage package search endpoint.

    Each page is the ``pageContents`` list returned by a POST request to
    ``{base_url}packages/search``. When the lister state holds a
    ``last_listing_date``, a ``lastUpload >= <date>`` clause (UTC calendar
    day) is appended to the base ``(deprecated:any)`` search query.
    Nothing is yielded when the first page comes back without contents.
    """
    search_query = "(deprecated:any)"
    previous_listing = self.state.last_listing_date
    if previous_listing:
        # Incremental mode: only the UTC calendar day goes into the filter.
        since_day = previous_listing.astimezone(tz=timezone.utc).date()
        search_query += "(lastUpload >= %s)" % since_day.isoformat()

    params = {
        "page": 0,
        "sortColumn": "default",
        "sortDirection": "ascending",
        "searchQuery": search_query,
    }
    endpoint = self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)

    def fetch(page_number: int) -> Dict[str, Any]:
        # The same params dict is reused; only the page index changes.
        params["page"] = page_number
        return self.http_request(url=endpoint, method="POST", json=params).json()

    first = fetch(0)
    if not first.get("pageContents"):
        return

    # Ceiling division: a partially filled last page still counts as a page.
    total_entries: int = first["numberOfResults"]
    page_count = -(-total_entries // self.page_size)

    yield first["pageContents"]
    for page_number in range(1, page_count):
        yield fetch(page_number)["pageContents"]
def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
    """Yield one ListedOrigin for each package entry of the given page.

    The origin url is formatted from the entry's display name, and the
    entry's ``lastUpload`` value is parsed into the origin's ``last_update``.
    """
    lister_id = self.lister_obj.id
    assert lister_id is not None

    for package in page:
        name = package["name"]["display"]
        uploaded_at = iso8601.parse_date(package["lastUpload"])
        origin_url = self.PACKAGE_INFO_URL_PATTERN.format(
            base_url=self.url, pkgname=name
        )
        yield ListedOrigin(
            lister_id=lister_id,
            visit_type=self.VISIT_TYPE,
            url=origin_url,
            last_update=uploaded_at,
        )
def finalize(self) -> None:
    """Store this run's listing date in the state and flag the lister as updated."""
    listing_date = self.listing_date
    self.state.last_listing_date = listing_date
    self.updated = True