pypi lister: schedule load-pypi tasks with a single `url` keyword argument.

The task no longer carries the project name or the project metadata url;
the model `uid` and `html_url` are now the origin url, and the test is
adapted to the new task arguments.

diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py
index 99d8013..09a731d 100644
--- a/swh/lister/pypi/lister.py
+++ b/swh/lister/pypi/lister.py
@@ -1,71 +1,65 @@
 # Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import random
 import xmltodict
 
 from .models import PyPIModel
 
 from swh.scheduler import utils
 from swh.lister.core.simple_lister import SimpleLister
 from swh.lister.core.lister_transports import ListerOnePageApiTransport
 
 
 class PyPILister(ListerOnePageApiTransport, SimpleLister):
     MODEL = PyPIModel
     LISTER_NAME = 'pypi'
     PAGE = 'https://pypi.org/simple/'
     instance = 'pypi'  # As of today only the main pypi.org is used
 
     def __init__(self, override_config=None):
         ListerOnePageApiTransport .__init__(self)
         SimpleLister.__init__(self, override_config=override_config)
 
     def task_dict(self, origin_type, origin_url, **kwargs):
         """(Override) Return task format dict
 
         This is overridden from the lister_base as more information is
         needed for the ingestion task creation.
 
         """
         _type = 'load-%s' % origin_type
         _policy = kwargs.get('policy', 'recurring')
-        project_name = kwargs.get('name')
-        project_metadata_url = kwargs.get('html_url')
         return utils.create_task_dict(
-            _type, _policy, project_name, origin_url,
-            project_metadata_url=project_metadata_url)
+            _type, _policy, url=origin_url)
 
     def list_packages(self, response):
         """(Override) List the actual pypi origins from the response.
 
         """
         result = xmltodict.parse(response.content)
         _packages = [p['#text'] for p in result['html']['body']['a']]
         random.shuffle(_packages)
         return _packages
 
-    def _compute_urls(self, repo_name):
-        """Returns a tuple (project_url, project_metadata_url)
+    def origin_url(self, repo_name: str) -> str:
+        """Returns origin_url
 
         """
-        return (
-            'https://pypi.org/project/%s/' % repo_name,
-            'https://pypi.org/pypi/%s/json' % repo_name
-        )
+        return 'https://pypi.org/project/%s/' % repo_name
 
     def get_model_from_repo(self, repo_name):
         """(Override) Transform from repository representation to model
 
         """
-        project_url, project_url_meta = self._compute_urls(repo_name)
+        origin_url = self.origin_url(repo_name)
         return {
-            'uid': repo_name,
+            'uid': origin_url,
             'name': repo_name,
             'full_name': repo_name,
-            'html_url': project_url_meta,
-            'origin_url': project_url,
+            'html_url': origin_url,
+            'origin_url': origin_url,
             'origin_type': 'pypi',
         }
diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py
index a67dd22..6f7fc4d 100644
--- a/swh/lister/pypi/tests/test_lister.py
+++ b/swh/lister/pypi/tests/test_lister.py
@@ -1,29 +1,27 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 def test_pypi_lister(lister_pypi, requests_mock_datadir):
     lister_pypi.run()
 
     r = lister_pypi.scheduler.search_tasks(task_type='load-pypi')
     assert len(r) == 4
 
     for row in r:
         assert row['type'] == 'load-pypi'
         # arguments check
         args = row['arguments']['args']
-        assert len(args) == 2
-
-        project = args[0]
-        url = args[1]
-        assert url == 'https://pypi.org/project/%s/' % project
+        assert len(args) == 0
 
         # kwargs
         kwargs = row['arguments']['kwargs']
-        meta_url = kwargs['project_metadata_url']
-        assert meta_url == 'https://pypi.org/pypi/%s/json' % project
+        assert len(kwargs) == 1
+
+        origin_url = kwargs['url']
+        assert 'https://pypi.org/project' in origin_url
 
         assert row['policy'] == 'recurring'
         assert row['priority'] is None