Drop the xmltodict dependency from the PyPI lister.

* mypy.ini: remove the [mypy-xmltodict.*] ignore_missing_imports override.
* requirements.txt: remove xmltodict.
* swh/lister/pypi/lister.py: PyPILister.get_pages now parses the package
  index response with BeautifulSoup (features="html.parser") and collects
  the text of every <a> element, instead of reading the "#text" entries of
  xmltodict's html/body/a mapping.

NOTE(review): this patch was recovered from a copy whose line breaks had
been collapsed. Blank lines were re-inserted so that every hunk agrees with
the line counts in its @@ header; confirm them against the target tree
before applying.

diff --git a/mypy.ini b/mypy.ini
index c84a7e7..8aab2fa 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,41 +1,38 @@
 [mypy]
 namespace_packages = True
 warn_unused_ignores = True
 
 # 3rd party libraries without stubs (yet)
 
 [mypy-bs4.*]
 ignore_missing_imports = True
 
 [mypy-celery.*]
 ignore_missing_imports = True
 
 [mypy-debian.*]
 ignore_missing_imports = True
 
 [mypy-iso8601.*]
 ignore_missing_imports = True
 
 [mypy-launchpadlib.*]
 ignore_missing_imports = True
 
 [mypy-lazr.*]
 ignore_missing_imports = True
 
 [mypy-pkg_resources.*]
 ignore_missing_imports = True
 
 [mypy-pytest.*]
 ignore_missing_imports = True
 
 [mypy-pytest_postgresql.*]
 ignore_missing_imports = True
 
 [mypy-requests_mock.*]
 ignore_missing_imports = True
 
 [mypy-urllib3.util.*]
 ignore_missing_imports = True
-
-[mypy-xmltodict.*]
-ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
index 34bf339..4f6c24e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,7 @@
 python_debian
 requests
 setuptools
-xmltodict
 iso8601
 beautifulsoup4
 launchpadlib
 tenacity
diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
index ae9874b..ad52e22 100644
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -1,76 +1,77 @@
 # Copyright (C) 2018-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 from typing import Iterator, List, Optional
 
+from bs4 import BeautifulSoup
 import requests
-import xmltodict
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
 
 PackageListPage = List[str]
 
 
 class PyPILister(StatelessLister[PackageListPage]):
     """List origins from PyPI.
 
     """
 
     LISTER_NAME = "pypi"
     INSTANCE = "pypi"  # As of today only the main pypi.org is used
 
     PACKAGE_LIST_URL = "https://pypi.org/simple/"
     PACKAGE_URL = "https://pypi.org/project/{package_name}/"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         credentials: Optional[CredentialsType] = None,
     ):
         super().__init__(
             scheduler=scheduler,
             url=self.PACKAGE_LIST_URL,
             instance=self.INSTANCE,
             credentials=credentials,
         )
 
         self.session = requests.Session()
         self.session.headers.update(
             {"Accept": "application/html", "User-Agent": USER_AGENT}
         )
 
     def get_pages(self) -> Iterator[PackageListPage]:
 
         response = self.session.get(self.PACKAGE_LIST_URL)
 
         response.raise_for_status()
 
-        page_xmldict = xmltodict.parse(response.content)
-        page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
+        page = BeautifulSoup(response.content, features="html.parser")
+
+        page_results = [p.text for p in page.find_all("a")]
 
         yield page_results
 
     def get_origins_from_page(
         self, packages_name: PackageListPage
     ) -> Iterator[ListedOrigin]:
         """Convert a page of PyPI repositories into a list of ListedOrigins."""
         assert self.lister_obj.id is not None
 
         for package_name in packages_name:
             package_url = self.PACKAGE_URL.format(package_name=package_name)
 
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=package_url,
                 visit_type="pypi",
                 last_update=None,  # available on PyPI JSON API
             )