diff --git a/PKG-INFO b/PKG-INFO index 6e7d012..57a85c7 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.lister -Version: 0.0.14 +Version: 0.0.15 Summary: Software Heritage GitHub lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/README.md b/README.md index 3c3878b..0607ce4 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,202 @@ SWH-lister ============ The Software Heritage Lister is both a library module to permit to centralize lister behaviors, and to provide lister implementations. Actual lister implementations are: +- swh-lister-bitbucket - swh-lister-debian - swh-lister-github - swh-lister-gitlab -- swh-lister-bitbucket +- swh-lister-pypi Licensing ---------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ - python3 - python3-requests - python3-sqlalchemy More details in requirements*.txt Local deployment ----------- ## lister-github ### Preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. Bootstrap the db instance schema $ createdb lister-github $ python3 -m swh.lister.cli --db-url postgres:///lister-github \ --lister github \ --create-tables ### Configuration file sample Minimalistic configuration: $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-github credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/github.com + cache_dir: /home/user/.cache/swh/lister/github.com Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 >>> import logging >>> logging.basicConfig(level=logging.DEBUG) >>> from swh.lister.github.tasks import RangeGitHubLister; RangeGitHubLister().run(364, 365) INFO:root:listing repos starting at 364 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 ## lister-gitlab ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ 3. create configuration file ~/.config/swh/lister-gitlab.yml 4. Bootstrap the db instance schema $ createdb lister-gitlab $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \ --lister gitlab \ --create-tables ### Configuration file sample $ cat ~/.config/swh/lister-gitlab.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-gitlab credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/gitlab + cache_dir: /home/user/.cache/swh/lister/gitlab Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task( {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task( {'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20}) ## lister-debian ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/debian/ 3. create configuration file ~/.config/swh/lister-debian.yml 4. Bootstrap the db instance schema $ createdb lister-debian $ python3 -m swh.lister.cli --db-url postgres:///lister-debian \ --lister debian \ --create-tables \ --with-data Note: This bootstraps a minimum data set needed for the debian lister to run (for development) ### Configuration file sample $ cat ~/.config/swh/lister-debian.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-debian credentials: [] cache_responses: True - cache_dir: /home/zack/.cache/swh/lister/debian + cache_dir: /home/user/.cache/swh/lister/debian Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import logging; logging.basicConfig(level=logging.DEBUG); from swh.lister.debian.tasks import DebianListerTask; DebianListerTask().run_task('Debian') DEBUG:root:Creating snapshot for distribution Distribution(Debian (deb) on http://deb.debian.org/debian/) on date 2018-07-27 09:22:50.461165+00:00 DEBUG:root:Processing area Area(stretch/main of Debian) DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325 ... + + +## lister-pypi + +### preparation steps + +1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) +2. mkdir ~/.config/swh/ ~/.cache/swh/lister/pypi/ +3. create configuration file ~/.config/swh/lister-pypi.yml +4. Bootstrap the db instance schema + + $ createdb lister-pypi + $ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \ + --lister pypi \ + --create-tables \ + --with-data + + Note: This bootstraps a minimum data set needed for the pypi + lister to run (for development) + +### Configuration file sample + + $ cat ~/.config/swh/lister-pypi.yml + # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls + lister_db_url: postgres:///lister-pypi + credentials: [] + cache_responses: True + cache_dir: /home/user/.cache/swh/lister/pypi + +Note: This expects storage (5002) and scheduler (5008) services to run locally + +### Run + + $ python3 + Python 3.6.6 (default, Jun 27 2018, 14:44:17) + [GCC 8.1.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task() + >>> diff --git a/debian/rules b/debian/rules index 49bdd90..45428fd 100755 --- a/debian/rules +++ b/debian/rules @@ -1,11 +1,11 @@ #!/usr/bin/make -f export PYBUILD_NAME=swh.lister -export export PYBUILD_TEST_ARGS=--with-doctest -sv -a !db,!fs +export PYBUILD_TEST_ARGS=--with-doctest -sv -a !db,!fs %: dh $@ --with python3 --buildsystem=pybuild override_dh_install: dh_install rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/__init__.py diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 6e7d012..57a85c7 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.lister -Version: 0.0.14 +Version: 0.0.15 Summary: Software Heritage GitHub lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index 72f9b9e..7d02d7e 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,88 +1,93 @@ .gitignore ACKNOWLEDGEMENTS LICENSE MANIFEST.in Makefile README.md requirements-swh.txt requirements-test.txt requirements.txt setup.py version.txt bin/batch bin/ghlister bin/reset.sh bin/status debian/changelog debian/clean debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/_version.py swh/lister/cli.py swh/lister/utils.py swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/models.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/api_empty_response.json swh/lister/bitbucket/tests/api_response.json swh/lister/bitbucket/tests/test_bb_lister.py swh/lister/core/__init__.py swh/lister/core/abstractattribute.py swh/lister/core/db_utils.py swh/lister/core/indexing_lister.py swh/lister/core/lister_base.py swh/lister/core/lister_transports.py swh/lister/core/models.py swh/lister/core/page_by_page_lister.py +swh/lister/core/simple_lister.py swh/lister/core/tasks.py swh/lister/core/tests/__init__.py swh/lister/core/tests/test_abstractattribute.py swh/lister/core/tests/test_lister.py swh/lister/core/tests/test_model.py swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/utils.py swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/models.py swh/lister/github/tasks.py swh/lister/github/tests/__init__.py swh/lister/github/tests/api_empty_response.json swh/lister/github/tests/api_response.json swh/lister/github/tests/test_gh_lister.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/models.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/api_empty_response.json swh/lister/gitlab/tests/api_response.json swh/lister/gitlab/tests/test_gitlab_lister.py +swh/lister/pypi/__init__.py +swh/lister/pypi/lister.py +swh/lister/pypi/models.py +swh/lister/pypi/tasks.py swh/lister/tests/__init__.py swh/lister/tests/test_utils.py \ No newline at end of file diff --git a/swh/lister/_version.py b/swh/lister/_version.py index 0265f09..cb9fa9b 100644 --- a/swh/lister/_version.py +++ b/swh/lister/_version.py @@ -1,5 +1,5 @@ # This file is automatically generated by setup.py. -__version__ = '0.0.14' -__sha__ = 'g8b2ee22' -__revision__ = 'g8b2ee22' +__version__ = '0.0.15' +__sha__ = 'gcba22b7' +__revision__ = 'gcba22b7' diff --git a/swh/lister/cli.py b/swh/lister/cli.py index ee7ff09..4997fe6 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,88 +1,93 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click -SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian'] +SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi'] @click.command() @click.option( '--db-url', '-d', default='postgres:///lister-gitlab.com', help='SQLAlchemy DB URL; see ' '') # noqa @click.option('--lister', required=1, type=click.Choice(SUPPORTED_LISTERS), help='Lister to act upon') @click.option('--create-tables', is_flag=True, default=False, help='create tables') @click.option('--drop-tables', is_flag=True, default=False, help='Drop tables') @click.option('--with-data', is_flag=True, default=False, help='Insert minimum required data') def cli(db_url, lister, create_tables, drop_tables, with_data): """Initialize db model according to lister. """ override_conf = {'lister_db_url': db_url} insert_minimum_data = None if lister == 'github': from .github.models import IndexingModelBase as ModelBase from .github.lister import GitHubLister _lister = GitHubLister(api_baseurl='https://api.github.com', override_config=override_conf) elif lister == 'bitbucket': from .bitbucket.models import IndexingModelBase as ModelBase from .bitbucket.lister import BitBucketLister _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0', override_config=override_conf) elif lister == 'gitlab': from .gitlab.models import ModelBase from .gitlab.lister import GitLabLister _lister = GitLabLister(api_baseurl='https://gitlab.com/api/v4/', override_config=override_conf) elif lister == 'debian': from .debian.lister import DebianLister ModelBase = DebianLister.MODEL _lister = DebianLister() def insert_minimum_data(lister): from swh.storage.schemata.distribution import Distribution, Area d = Distribution( name='Debian', type='deb', mirror_uri='http://deb.debian.org/debian/') lister.db_session.add(d) areas = [] for distribution_name in ['stretch']: for area_name in ['main', 'contrib', 'non-free']: areas.append(Area( name='%s/%s' % (distribution_name, area_name), distribution=d, )) lister.db_session.add_all(areas) lister.db_session.commit() + elif lister == 'pypi': + from .pypi.models import ModelBase + from .pypi.lister import PyPiLister + _lister = PyPiLister(override_config=override_conf) + else: raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS) if drop_tables: ModelBase.metadata.drop_all(_lister.db_engine) if create_tables: ModelBase.metadata.create_all(_lister.db_engine) if with_data and insert_minimum_data: insert_minimum_data(_lister) if __name__ == '__main__': cli() diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index d6e85b6..ef59b6f 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,153 +1,232 @@ -# Copyright (C) 2017 the Software Heritage developers +# Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import random from datetime import datetime from email.utils import parsedate from pprint import pformat +from xmlrpc import client import requests import xmltodict try: from swh.lister._version import __version__ except ImportError: __version__ = 'devel' from .abstractattribute import AbstractAttribute from .lister_base import FetchError +class ListerXMLRPCTransport(abc.ABC): + """Use the xmlrpc library for making Lister endpoint requests. + + To be used in conjunction with SWHListerBase or a subclass of it. + """ + SERVER = AbstractAttribute('string containing the server to contact for ' + 'information') + + def __init__(self): + self.lister_version = __version__ + + def get_client(self, path): + """Initialize client to query for result + + """ + return client.ServerProxy(path) + + def request_uri(self, _): + """Same uri called once + + """ + return self.SERVER + + def request_params(self, identifier): + """Cannot pass any parameters to query to the xmlrpc client so cannot + even pass our user-agent specifics. + + """ + return {} + + def transport_quota_check(self, response): + """No rate limit dealing explained. + + """ + return False, 0 + + def transport_request(self, identifier): + """Implements SWHListerBase.transport_request + + """ + path = self.request_uri(identifier) + try: + return self.get_client(path) + except Exception as e: + raise FetchError(e) + + def transport_response_to_string(self, response): + """Implements SWHListerBase.transport_response_to_string for XMLRPC + given responses. + + """ + s = pformat(self.SERVER) + s += '\n#\n' + pformat(response) # Note: will potentially be big + return s + + class SWHListerHttpTransport(abc.ABC): """Use the Requests library for making Lister endpoint requests. To be used in conjunction with SWHListerBase or a subclass of it. """ PATH_TEMPLATE = AbstractAttribute('string containing a python string' ' format pattern that produces the API' ' endpoint path for listing stored' ' repositories when given an index.' ' eg. "/repositories?after=%s".' 'To be implemented in the API-specific' ' class inheriting this.') EXPECTED_STATUS_CODES = (200, 429, 403, 404) def request_headers(self): """Returns dictionary of any request headers needed by the server. MAY BE OVERRIDDEN if request headers are needed. """ return { 'User-Agent': 'Software Heritage lister (%s)' % self.lister_version } def request_uri(self, identifier): """Get the full request URI given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the PATH_TEMPLATE is required. """ path = self.PATH_TEMPLATE % identifier return self.api_baseurl + path def request_params(self, identifier): """Get the full parameters passed to requests given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the request headers is needed. """ params = {} params['headers'] = self.request_headers() or {} creds = self.config['credentials'] auth = random.choice(creds) if creds else None if auth: params['auth'] = (auth['username'], auth['password']) return params def transport_quota_check(self, response): """Implements SWHListerBase.transport_quota_check with standard 429 code check for HTTP with Requests library. MAY BE OVERRIDDEN if the server notifies about rate limits in a non-standard way that doesn't use HTTP 429 and the Retry-After response header. ( https://tools.ietf.org/html/rfc6585#section-4 ) """ if response.status_code == 429: # HTTP too many requests retry_after = response.headers.get('Retry-After', self.back_off()) try: # might be seconds return True, float(retry_after) except Exception: # might be http-date at_date = datetime(*parsedate(retry_after)[:6]) from_now = (at_date - datetime.today()).total_seconds() + 5 return True, max(0, from_now) else: # response ok self.reset_backoff() return False, 0 def __init__(self, api_baseurl=None): if not api_baseurl: raise NameError('HTTP Lister Transport requires api_baseurl.') self.api_baseurl = api_baseurl # eg. 'https://api.github.com' self.session = requests.Session() self.lister_version = __version__ def _transport_action(self, identifier, method='get'): """Permit to ask information to the api prior to actually executing query. """ path = self.request_uri(identifier) params = self.request_params(identifier) try: if method == 'head': response = self.session.head(path, **params) else: response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: raise FetchError(e) else: if response.status_code not in self.EXPECTED_STATUS_CODES: raise FetchError(response) return response def transport_head(self, identifier): """Retrieve head information on api. """ return self._transport_action(identifier, method='head') def transport_request(self, identifier): """Implements SWHListerBase.transport_request for HTTP using Requests. Retrieve get information on api. """ return self._transport_action(identifier) def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for HTTP given Requests responses. """ s = pformat(response.request.path_url) s += '\n#\n' + pformat(response.request.headers) s += '\n#\n' + pformat(response.status_code) s += '\n#\n' + pformat(response.headers) s += '\n#\n' try: # json? s += pformat(response.json()) except Exception: # not json try: # xml? s += pformat(xmltodict.parse(response.text)) except Exception: # not xml s += pformat(response.text) return s + + +class ListerOnePageApiTransport(SWHListerHttpTransport): + """Leverage requests library to retrieve basic html page and parse + result. + + To be used in conjunction with SWHListerBase or a subclass of it. + + """ + PAGE = AbstractAttribute("The server api's unique page to retrieve and " + "parse for information") + PATH_TEMPLATE = None # we do not use it + + def __init__(self, api_baseurl=None): + self.session = requests.Session() + self.lister_version = __version__ + + def request_uri(self, _): + """Get the full request URI given the transport_request identifier. + + """ + return self.PAGE diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py new file mode 100644 index 0000000..8455a79 --- /dev/null +++ b/swh/lister/core/simple_lister.py @@ -0,0 +1,76 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import abc +import logging + +from swh.core import utils + +from .lister_base import SWHListerBase + + +class SimpleLister(SWHListerBase): + """Lister* intermediate class for any service that follows the simple, + 'list in oneshot information' pattern. + + - Client sends a request to list repositories in oneshot + + - Client receives structured (json/xml/etc) response with + information and stores those in db + + """ + @abc.abstractmethod + def list_packages(self, *args): + """Listing packages method. + + """ + pass + + def ingest_data(self, identifier, checks=False): + """Rework the base ingest_data. + Request server endpoint which gives all in one go. + + Simplify and filter response list of repositories. Inject + repo information into local db. Queue loader tasks for + linked repositories. + + Args: + identifier: Resource identifier (unused) + checks (bool): Additional checks required (unused) + + """ + response = self.safely_issue_request(identifier) + response = self.list_packages(response) + if not response: + return response, [] + models_list = self.transport_response_simplified(response) + models_list = self.filter_before_inject(models_list) + all_injected = [] + for models in utils.grouper(models_list, n=10000): + models = list(models) + logging.debug('models: %s' % len(models)) + # inject into local db + injected = self.inject_repo_data_into_db(models) + # queue workers + self.create_missing_origins_and_tasks(models, injected) + all_injected.append(injected) + # flush + self.db_session.commit() + self.db_session = self.mk_session() + + return response, all_injected + + def run(self): + """Query the server which answers in one query. Stores the + information, dropping actual redundant information we + already have. + + Returns: + nothing + + """ + dump_not_used_identifier = 0 + response, injected_repos = self.ingest_data(dump_not_used_identifier) + if not response and not injected_repos: + logging.info('No response from api server, stopping') diff --git a/swh/lister/pypi/__init__.py b/swh/lister/pypi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py new file mode 100644 index 0000000..6d04d78 --- /dev/null +++ b/swh/lister/pypi/lister.py @@ -0,0 +1,76 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import random +import xmltodict + +from .models import PyPiModel + +from swh.scheduler import utils +from swh.lister.core.simple_lister import SimpleLister +from swh.lister.core.lister_transports import ListerOnePageApiTransport + + +class PyPiLister(ListerOnePageApiTransport, SimpleLister): + MODEL = PyPiModel + LISTER_NAME = 'pypi' + PAGE = 'https://pypi.org/simple/' + + def __init__(self, override_config=None): + ListerOnePageApiTransport .__init__(self) + SimpleLister.__init__(self, override_config=override_config) + + def task_dict(self, origin_type, origin_url, **kwargs): + """(Override) Return task format dict + + This is overridden from the lister_base as more information is + needed for the ingestion task creation. + + """ + _type = 'origin-update-%s' % origin_type + _policy = 'recurring' + project_name = kwargs.get('name') + project_metadata_url = kwargs.get('html_url') + return utils.create_task_dict( + _type, _policy, project_name, origin_url, + project_metadata_url=project_metadata_url) + + def list_packages(self, response): + """(Override) List the actual pypi origins from the response. + + """ + result = xmltodict.parse(response.content) + _packages = [p['#text'] for p in result['html']['body']['a']] + random.shuffle(_packages) + return _packages + + def _compute_urls(self, repo_name): + """Returns a tuple (project_url, project_metadata_url) + + """ + return ( + 'https://pypi.org/pypi/%s/' % repo_name, + 'https://pypi.org/pypi/%s/json' % repo_name + ) + + def get_model_from_repo(self, repo_name): + """(Override) Transform from repository representation to model + + """ + project_url, project_url_meta = self._compute_urls(repo_name) + return { + 'uid': repo_name, + 'name': repo_name, + 'full_name': repo_name, + 'html_url': project_url_meta, + 'origin_url': project_url, + 'origin_type': 'pypi', + 'description': None, + } + + def transport_response_simplified(self, response): + """(Override) Transform response to list for model manipulation + + """ + return [self.get_model_from_repo(repo_name) for repo_name in response] diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py new file mode 100644 index 0000000..b035f4c --- /dev/null +++ b/swh/lister/pypi/models.py @@ -0,0 +1,16 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from sqlalchemy import Column, String + +from ..core.models import ModelBase + + +class PyPiModel(ModelBase): + """a PyPi repository representation + + """ + __tablename__ = 'pypi_repo' + + uid = Column(String, primary_key=True) diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py new file mode 100644 index 0000000..d8b0e2c --- /dev/null +++ b/swh/lister/pypi/tasks.py @@ -0,0 +1,20 @@ +# Copyright (C) 2018 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from ..core.tasks import ListerTaskBase +from .lister import PyPiLister + + +class PyPiListerTask(ListerTaskBase): + """Full PyPi lister (list all available origins from the api). + + """ + task_queue = 'swh_lister_pypi_refresh' + + def new_lister(self): + return PyPiLister() + + def run_task(self): + lister = self.new_lister() + lister.run() diff --git a/version.txt b/version.txt index ca556bc..6528b5a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.14-0-g8b2ee22 \ No newline at end of file +v0.0.15-0-gcba22b7 \ No newline at end of file