Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pypi/lister.py
# Copyright (C) 2018-2021 The Software Heritage developers | # Copyright (C) 2018-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
from typing import Iterator, List, Optional | from typing import Iterator, List, Optional | ||||
from bs4 import BeautifulSoup | |||||
import requests | import requests | ||||
import xmltodict | |||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from .. import USER_AGENT | from .. import USER_AGENT | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Show All 30 Lines | ): | ||||
) | ) | ||||
def get_pages(self) -> Iterator[PackageListPage]: | def get_pages(self) -> Iterator[PackageListPage]: | ||||
response = self.session.get(self.PACKAGE_LIST_URL) | response = self.session.get(self.PACKAGE_LIST_URL) | ||||
response.raise_for_status() | response.raise_for_status() | ||||
page_xmldict = xmltodict.parse(response.content) | page = BeautifulSoup(response.content, features="html.parser") | ||||
page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]] | |||||
page_results = [p.text for p in page.find_all("a")] | |||||
yield page_results | yield page_results | ||||
def get_origins_from_page( | def get_origins_from_page( | ||||
self, packages_name: PackageListPage | self, packages_name: PackageListPage | ||||
) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
"""Convert a page of PyPI repositories into a list of ListedOrigins.""" | """Convert a page of PyPI repositories into a list of ListedOrigins.""" | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
Show All 10 Lines |