diff --git a/README.md b/README.md index 0607ce4..f1a1711 100644 --- a/README.md +++ b/README.md @@ -1,202 +1,202 @@ SWH-lister ============ The Software Heritage Lister is both a library module that centralizes common lister behavior and a collection of lister implementations. Current lister implementations are: - swh-lister-bitbucket - swh-lister-debian - swh-lister-github - swh-lister-gitlab - swh-lister-pypi Licensing ---------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ - python3 - python3-requests - python3-sqlalchemy More details in requirements*.txt Local deployment ----------- ## lister-github ### Preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/github.com/ 3. create configuration file ~/.config/swh/lister-github.com.yml 4. 
Bootstrap the db instance schema $ createdb lister-github $ python3 -m swh.lister.cli --db-url postgres:///lister-github \ --lister github \ --create-tables ### Configuration file sample Minimalistic configuration: $ cat ~/.config/swh/lister-github.com.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-github credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister/github.com Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 >>> import logging >>> logging.basicConfig(level=logging.DEBUG) >>> from swh.lister.github.tasks import RangeGitHubLister; RangeGitHubLister().run(364, 365) INFO:root:listing repos starting at 364 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repositories?since=364 HTTP/1.1" 200 None DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost DEBUG:urllib3.connectionpool:http://localhost:5002 "POST /origin/add HTTP/1.1" 200 1 ## lister-gitlab ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/gitlab/ 3. create configuration file ~/.config/swh/lister-gitlab.yml 4. Bootstrap the db instance schema $ createdb lister-gitlab $ python3 -m swh.lister.cli --db-url postgres:///lister-gitlab \ --lister gitlab \ --create-tables ### Configuration file sample $ cat ~/.config/swh/lister-gitlab.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-gitlab credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister/gitlab Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. 
>>> from swh.lister.gitlab.tasks import RangeGitLabLister; RangeGitLabLister().run_task(1, 2, {'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import FullGitLabRelister; FullGitLabRelister().run_task( {'instance':'0xacab', 'api_baseurl':'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20}) >>> from swh.lister.gitlab.tasks import IncrementalGitLabLister; IncrementalGitLabLister().run_task( {'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20}) ## lister-debian ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/debian/ 3. create configuration file ~/.config/swh/lister-debian.yml 4. Bootstrap the db instance schema $ createdb lister-debian $ python3 -m swh.lister.cli --db-url postgres:///lister-debian \ --lister debian \ --create-tables \ --with-data Note: This bootstraps a minimum data set needed for the debian lister to run (for development) ### Configuration file sample $ cat ~/.config/swh/lister-debian.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-debian credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister/debian Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. 
>>> import logging; logging.basicConfig(level=logging.DEBUG); from swh.lister.debian.tasks import DebianListerTask; DebianListerTask().run_task('Debian') DEBUG:root:Creating snapshot for distribution Distribution(Debian (deb) on http://deb.debian.org/debian/) on date 2018-07-27 09:22:50.461165+00:00 DEBUG:root:Processing area Area(stretch/main of Debian) DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): deb.debian.org DEBUG:urllib3.connectionpool:http://deb.debian.org:80 "GET /debian//dists/stretch/main/source/Sources.xz HTTP/1.1" 302 325 ... ## lister-pypi ### preparation steps 1. git clone under $SWH_ENVIRONMENT_HOME/swh-lister (of your choosing) 2. mkdir ~/.config/swh/ ~/.cache/swh/lister/pypi/ 3. create configuration file ~/.config/swh/lister-pypi.yml 4. Bootstrap the db instance schema $ createdb lister-pypi $ python3 -m swh.lister.cli --db-url postgres:///lister-pypi \ --lister pypi \ --create-tables \ --with-data Note: For the pypi lister `--with-data` is currently a no-op (swh.lister.cli only defines a minimum data set for the debian lister), so it can be omitted (for development) ### Configuration file sample $ cat ~/.config/swh/lister-pypi.yml # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls lister_db_url: postgres:///lister-pypi credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister/pypi Note: This expects storage (5002) and scheduler (5008) services to run locally ### Run $ python3 Python 3.6.6 (default, Jun 27 2018, 14:44:17) [GCC 8.1.0] on linux Type "help", "copyright", "credits" or "license" for more information. 
- >>> from swh.lister.pypi.tasks import PyPiListerTask; PyPiListerTask().run_task() + >>> from swh.lister.pypi.tasks import PyPIListerTask; PyPIListerTask().run_task() >>> diff --git a/swh/lister/cli.py b/swh/lister/cli.py index 4997fe6..c5503ec 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,93 +1,93 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi'] @click.command() @click.option( '--db-url', '-d', default='postgres:///lister-gitlab.com', help='SQLAlchemy DB URL; see ' '') # noqa @click.option('--lister', required=1, type=click.Choice(SUPPORTED_LISTERS), help='Lister to act upon') @click.option('--create-tables', is_flag=True, default=False, help='create tables') @click.option('--drop-tables', is_flag=True, default=False, help='Drop tables') @click.option('--with-data', is_flag=True, default=False, help='Insert minimum required data') def cli(db_url, lister, create_tables, drop_tables, with_data): """Initialize db model according to lister. 
""" override_conf = {'lister_db_url': db_url} insert_minimum_data = None if lister == 'github': from .github.models import IndexingModelBase as ModelBase from .github.lister import GitHubLister _lister = GitHubLister(api_baseurl='https://api.github.com', override_config=override_conf) elif lister == 'bitbucket': from .bitbucket.models import IndexingModelBase as ModelBase from .bitbucket.lister import BitBucketLister _lister = BitBucketLister(api_baseurl='https://api.bitbucket.org/2.0', override_config=override_conf) elif lister == 'gitlab': from .gitlab.models import ModelBase from .gitlab.lister import GitLabLister _lister = GitLabLister(api_baseurl='https://gitlab.com/api/v4/', override_config=override_conf) elif lister == 'debian': from .debian.lister import DebianLister ModelBase = DebianLister.MODEL _lister = DebianLister() def insert_minimum_data(lister): from swh.storage.schemata.distribution import Distribution, Area d = Distribution( name='Debian', type='deb', mirror_uri='http://deb.debian.org/debian/') lister.db_session.add(d) areas = [] for distribution_name in ['stretch']: for area_name in ['main', 'contrib', 'non-free']: areas.append(Area( name='%s/%s' % (distribution_name, area_name), distribution=d, )) lister.db_session.add_all(areas) lister.db_session.commit() elif lister == 'pypi': from .pypi.models import ModelBase - from .pypi.lister import PyPiLister - _lister = PyPiLister(override_config=override_conf) + from .pypi.lister import PyPILister + _lister = PyPILister(override_config=override_conf) else: raise ValueError('Only supported listers are %s' % SUPPORTED_LISTERS) if drop_tables: ModelBase.metadata.drop_all(_lister.db_engine) if create_tables: ModelBase.metadata.create_all(_lister.db_engine) if with_data and insert_minimum_data: insert_minimum_data(_lister) if __name__ == '__main__': cli() diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 6d04d78..4d1b54e 100644 --- a/swh/lister/pypi/lister.py +++ 
b/swh/lister/pypi/lister.py @@ -1,76 +1,76 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import random import xmltodict -from .models import PyPiModel +from .models import PyPIModel from swh.scheduler import utils from swh.lister.core.simple_lister import SimpleLister from swh.lister.core.lister_transports import ListerOnePageApiTransport -class PyPiLister(ListerOnePageApiTransport, SimpleLister): - MODEL = PyPiModel +class PyPILister(ListerOnePageApiTransport, SimpleLister): + MODEL = PyPIModel LISTER_NAME = 'pypi' PAGE = 'https://pypi.org/simple/' def __init__(self, override_config=None): ListerOnePageApiTransport .__init__(self) SimpleLister.__init__(self, override_config=override_config) def task_dict(self, origin_type, origin_url, **kwargs): """(Override) Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. """ _type = 'origin-update-%s' % origin_type _policy = 'recurring' project_name = kwargs.get('name') project_metadata_url = kwargs.get('html_url') return utils.create_task_dict( _type, _policy, project_name, origin_url, project_metadata_url=project_metadata_url) def list_packages(self, response): """(Override) List the actual pypi origins from the response. 
""" result = xmltodict.parse(response.content) _packages = [p['#text'] for p in result['html']['body']['a']] random.shuffle(_packages) return _packages def _compute_urls(self, repo_name): """Returns a tuple (project_url, project_metadata_url) """ return ( 'https://pypi.org/pypi/%s/' % repo_name, 'https://pypi.org/pypi/%s/json' % repo_name ) def get_model_from_repo(self, repo_name): """(Override) Transform from repository representation to model """ project_url, project_url_meta = self._compute_urls(repo_name) return { 'uid': repo_name, 'name': repo_name, 'full_name': repo_name, 'html_url': project_url_meta, 'origin_url': project_url, 'origin_type': 'pypi', 'description': None, } def transport_response_simplified(self, response): """(Override) Transform response to list for model manipulation """ return [self.get_model_from_repo(repo_name) for repo_name in response] diff --git a/swh/lister/pypi/models.py b/swh/lister/pypi/models.py index b035f4c..f34eef9 100644 --- a/swh/lister/pypi/models.py +++ b/swh/lister/pypi/models.py @@ -1,16 +1,16 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from sqlalchemy import Column, String from ..core.models import ModelBase -class PyPiModel(ModelBase): - """a PyPi repository representation +class PyPIModel(ModelBase): + """a PyPI repository representation """ __tablename__ = 'pypi_repo' uid = Column(String, primary_key=True) diff --git a/swh/lister/pypi/tasks.py b/swh/lister/pypi/tasks.py index d8b0e2c..df2d275 100644 --- a/swh/lister/pypi/tasks.py +++ b/swh/lister/pypi/tasks.py @@ -1,20 +1,20 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from ..core.tasks import ListerTaskBase -from .lister import PyPiLister +from .lister import PyPILister -class 
PyPiListerTask(ListerTaskBase): - """Full PyPi lister (list all available origins from the api). +class PyPIListerTask(ListerTaskBase): + """Full PyPI lister (list all available origins from the api). """ task_queue = 'swh_lister_pypi_refresh' def new_lister(self): - return PyPiLister() + return PyPILister() def run_task(self): lister = self.new_lister() lister.run()