diff --git a/PKG-INFO b/PKG-INFO index 15cfd4a..031caf9 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,239 +1,257 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.0.30 +Version: 0.0.31 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.debian` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` - `swh.lister.cran` + - `swh.lister.cgit` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. 
Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... 
``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) 
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` + ## lister-cgit + + Once configured, you can execute a cgit lister using the following instructions + in a `python3` script: + + ```lang=python + import logging + from swh.lister.cgit.tasks import cgit_lister + + logging.basicConfig(level=logging.DEBUG) + # simple cgit instance + cgit_lister(url='https://git.kernel.org/') + # cgit instance whose listed repositories differ from the base url + cgit_lister(url='https://cgit.kde.org/', + url_prefix='https://anongit.kde.org/') + ``` + Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/README.md b/README.md index afe94f8..b6ee69e 100644 --- a/README.md +++ b/README.md @@ -1,219 +1,237 @@ swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.debian` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` - `swh.lister.cran` +- `swh.lister.cgit` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister/<lister_name>/` 2. create configuration file `~/.config/swh/lister_<lister_name>.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister-<lister_name> $ python3 -m swh.lister.cli --db-url postgres:///lister-<lister_name> ``` Note: This bootstraps a minimum data set needed for the lister to run.
### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_<lister_name>.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-<lister_name>' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister/<lister_name>/ ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... ``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister
logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a GNU lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` +## lister-cgit + +Once configured, you can execute a cgit lister using the following instructions +in a `python3` script: + +```lang=python +import logging +from swh.lister.cgit.tasks import cgit_lister + +logging.basicConfig(level=logging.DEBUG) +# simple cgit instance +cgit_lister(url='https://git.kernel.org/') +# cgit instance whose listed repositories differ from the base url +cgit_lister(url='https://cgit.kde.org/', + url_prefix='https://anongit.kde.org/') +``` + Licensing --------- This program is free software: you can redistribute it
and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/requirements.txt b/requirements.txt index 3ad87c4..51e86f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ SQLAlchemy arrow python_debian requests setuptools xmltodict iso8601 +beautifulsoup4 diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 15cfd4a..031caf9 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,239 +1,257 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.0.30 +Version: 0.0.31 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest +Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. 
It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.debian` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` - `swh.lister.cran` + - `swh.lister.cgit` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... 
``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) 
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` + ## lister-cgit + + Once configured, you can execute a cgit lister using the following instructions + in a `python3` script: + + ```lang=python + import logging + from swh.lister.cgit.tasks import cgit_lister + + logging.basicConfig(level=logging.DEBUG) + # simple cgit instance + cgit_lister(url='https://git.kernel.org/') + # cgit instance whose listed repositories differ from the base url + cgit_lister(url='https://cgit.kde.org/', + url_prefix='https://anongit.kde.org/') + ``` + Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index 172f102..a8c5e19 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,118 +1,127 @@ MANIFEST.in Makefile README.md requirements-swh.txt requirements-test.txt requirements.txt setup.py version.txt bin/ghlister swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/_version.py swh/lister/cli.py swh/lister/utils.py swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/models.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/api_empty_response.json swh/lister/bitbucket/tests/api_response.json swh/lister/bitbucket/tests/conftest.py swh/lister/bitbucket/tests/test_bb_lister.py swh/lister/bitbucket/tests/test_tasks.py +swh/lister/cgit/__init__.py +swh/lister/cgit/lister.py +swh/lister/cgit/models.py +swh/lister/cgit/tasks.py +swh/lister/cgit/tests/__init__.py +swh/lister/cgit/tests/conftest.py +swh/lister/cgit/tests/test_lister.py +swh/lister/cgit/tests/test_tasks.py swh/lister/core/__init__.py swh/lister/core/abstractattribute.py swh/lister/core/db_utils.py swh/lister/core/indexing_lister.py swh/lister/core/lister_base.py swh/lister/core/lister_transports.py swh/lister/core/models.py swh/lister/core/page_by_page_lister.py swh/lister/core/simple_lister.py swh/lister/core/tests/__init__.py swh/lister/core/tests/conftest.py 
swh/lister/core/tests/test_abstractattribute.py swh/lister/core/tests/test_lister.py swh/lister/core/tests/test_model.py swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py swh/lister/cran/models.py swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py swh/lister/cran/tests/conftest.py +swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/utils.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/conftest.py swh/lister/debian/tests/test_tasks.py swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/models.py swh/lister/github/tasks.py swh/lister/github/tests/__init__.py swh/lister/github/tests/api_empty_response.json swh/lister/github/tests/api_response.json swh/lister/github/tests/conftest.py swh/lister/github/tests/test_gh_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/models.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/api_empty_response.json swh/lister/gitlab/tests/api_response.json swh/lister/gitlab/tests/conftest.py swh/lister/gitlab/tests/test_gitlab_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gnu/__init__.py swh/lister/gnu/lister.py swh/lister/gnu/models.py swh/lister/gnu/tasks.py swh/lister/gnu/tests/__init__.py swh/lister/gnu/tests/api_response.json swh/lister/gnu/tests/conftest.py swh/lister/gnu/tests/file_structure.json swh/lister/gnu/tests/find_tarballs_output.json swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/models.py swh/lister/npm/tasks.py swh/lister/npm/tests/api_empty_response.json swh/lister/npm/tests/api_inc_empty_response.json swh/lister/npm/tests/api_inc_response.json swh/lister/npm/tests/api_response.json 
swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/models.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/api_empty_response.json swh/lister/phabricator/tests/api_response.json swh/lister/phabricator/tests/api_response_undefined_protocol.json swh/lister/phabricator/tests/conftest.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/pypi/__init__.py swh/lister/pypi/lister.py swh/lister/pypi/models.py swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/conftest.py swh/lister/pypi/tests/test_tasks.py swh/lister/tests/__init__.py swh/lister/tests/test_utils.py \ No newline at end of file diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 8ffcc3c..9e82f51 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,17 +1,18 @@ SQLAlchemy arrow python_debian requests setuptools xmltodict iso8601 +beautifulsoup4 swh.core swh.storage>=0.0.122 swh.storage[schemata] swh.scheduler>=0.0.39 [testing] pytest<4 pytest-postgresql requests_mock testing.postgresql diff --git a/swh/lister/_version.py b/swh/lister/_version.py index 687aa5e..cd3676a 100644 --- a/swh/lister/_version.py +++ b/swh/lister/_version.py @@ -1,5 +1,5 @@ # This file is automatically generated by setup.py. 
-__version__ = '0.0.30' -__sha__ = 'g52b1de8' -__revision__ = 'g52b1de8' +__version__ = '0.0.31' +__sha__ = 'g32c5cf2' +__revision__ = 'g32c5cf2' diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py new file mode 100644 index 0000000..3897adb --- /dev/null +++ b/swh/lister/cgit/lister.py @@ -0,0 +1,251 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import random +import logging +from bs4 import BeautifulSoup +import requests +from urllib.parse import urlparse + +from .models import CGitModel + +from swh.lister.core.simple_lister import SimpleLister +from swh.lister.core.lister_transports import ListerOnePageApiTransport + + +class CGitLister(ListerOnePageApiTransport, SimpleLister): + MODEL = CGitModel + LISTER_NAME = 'cgit' + PAGE = None + url_prefix_present = True + + def __init__(self, url, instance=None, url_prefix=None, + override_config=None): + """Inits Class with PAGE url and origin url prefix. + + Args: + url (str): URL of the CGit instance. + instance (str): Name of cgit instance. + url_prefix (str): Prefix of the origin_url. Origin link of the + repos of some special instances do not match + the url of the repository page, they have origin + url in the format <url_prefix>/<repo_name>.
+ + """ + self.PAGE = url + if url_prefix is None: + self.url_prefix = url + self.url_prefix_present = False + else: + self.url_prefix = url_prefix + + if not self.url_prefix.endswith('/'): + self.url_prefix += '/' + url = urlparse(self.PAGE) + self.url_netloc = find_netloc(url) + + if not instance: + instance = url.hostname + self.instance = instance + + ListerOnePageApiTransport .__init__(self) + SimpleLister.__init__(self, override_config=override_config) + + def list_packages(self, response): + """List the actual cgit instance origins from the response. + + Find repositories metadata by parsing the html page (response's raw + content). If there are links in the html page, retrieve those + repositories metadata from those pages as well. Return the + repositories as list of dictionaries. + + Args: + response (Response): http api request response. + + Returns: + List of repository origin urls (as dict) included in the response. + + """ + repos_details = [] + + for repo in self.yield_repo_from_responses(response): + repo_name = repo.a.text + origin_url = self.find_origin_url(repo, repo_name) + + try: + time = repo.span['title'] + except Exception: + time = None + + if origin_url is not None: + repos_details.append({ + 'name': repo_name, + 'time': time, + 'origin_url': origin_url, + }) + + random.shuffle(repos_details) + return repos_details + + def yield_repo_from_responses(self, response): + """Yield repositories from all pages of the cgit instance. + + Finds the number of pages present and yields the list of + repositories present. + + Args: + response (Response): server response. + + Yields: + List of beautifulsoup object of repository rows. 
+ + """ + html = response.text + yield from get_repo_list(html) + pages = self.get_pages(make_soup(html)) + if len(pages) > 1: + yield from self.get_repos_from_pages(pages[1:]) + + def find_origin_url(self, repo, repo_name): + """Finds the origin url for a repository + + Args: + repo (Beautifulsoup): Beautifulsoup object of the repository + row present in base url. + repo_name (str): Repository name. + + Returns: + string: origin url. + + """ + if self.url_prefix_present: + return self.url_prefix + repo_name + + return self.get_url(repo) + + def get_pages(self, url_soup): + """Find URL of all pages. + + Finds URL of pages that are present by parsing over the HTML of + pagination present at the end of the page. + + Args: + url_soup (Beautifulsoup): a beautifulsoup object of base URL + + Returns: + list: URL of pages present for a cgit instance + + """ + pages = url_soup.find('div', {"class": "content"}).find_all('li') + + if not pages: + return [self.PAGE] + + return [self.get_url(page) for page in pages] + + def get_repos_from_pages(self, pages): + """Find repos from all pages. + + Request the available repos from the pages. This yields + the available repositories found as beautiful object representation. + + Args: + pages ([str]): list of urls of all pages present for a + particular cgit instance. + + Yields: + List of beautifulsoup object of repository (url) rows + present in pages(except first). + + """ + for page in pages: + response = requests.get(page) + if not response.ok: + logging.warning('Failed to retrieve repositories from page %s', + page) + continue + + yield from get_repo_list(response.text) + + def get_url(self, repo): + """Finds url of a repo page. + + Finds the url of a repo page by parsing over the html of the row of + that repo present in the base url. + + Args: + repo (Beautifulsoup): a beautifulsoup object of the repository + row present in base url. + + Returns: + string: The url of a repo. 
+ + """ + suffix = repo.a['href'] + return self.url_netloc + suffix + + def get_model_from_repo(self, repo): + """Transform from repository representation to model. + + """ + return { + 'uid': self.PAGE + repo['name'], + 'name': repo['name'], + 'full_name': repo['name'], + 'html_url': repo['origin_url'], + 'origin_url': repo['origin_url'], + 'origin_type': 'git', + 'time_updated': repo['time'], + 'instance': self.instance, + } + + def transport_response_simplified(self, repos_details): + """Transform response to list for model manipulation. + + """ + return [self.get_model_from_repo(repo) for repo in repos_details] + + +def find_netloc(url): + """Finds the network location from the url. + + URLs in the repo are relative to the network location part of base + URL, so we need to compute it to reconstruct URLs. + + Args: + url (ParseResult): url parsed with urllib.parse.urlparse. + + Returns: + string: Scheme and Network location part in the base URL. + + Example: + For url = https://git.kernel.org/pub/scm/ + >>> find_netloc(url) + 'https://git.kernel.org' + + """ + return '%s://%s' % (url.scheme, url.netloc) + + +def get_repo_list(response): + """Find repositories (as beautifulsoup object) available within the server + response. + + Args: + response (str): html text of the server response + + Returns: + List all repositories as beautifulsoup object within the response. + + """ + repo_soup = make_soup(response) + return repo_soup \ + .find('div', {"class": "content"}).find_all("tr", {"class": ""}) + + +def make_soup(response): + """Instantiates a beautiful soup object from the response's html text.
+ + """ + return BeautifulSoup(response, features="html.parser") diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py new file mode 100644 index 0000000..4e16798 --- /dev/null +++ b/swh/lister/cgit/models.py @@ -0,0 +1,18 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from sqlalchemy import Column, String + +from ..core.models import ModelBase + + +class CGitModel(ModelBase): + """a CGit repository representation + + """ + __tablename__ = 'cgit_repo' + + uid = Column(String, primary_key=True) + time_updated = Column(String) + instance = Column(String, index=True) diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py new file mode 100644 index 0000000..31148dd --- /dev/null +++ b/swh/lister/cgit/tasks.py @@ -0,0 +1,25 @@ +# Copyright (C) 2019 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.scheduler.celery_backend.config import app + +from .lister import CGitLister + + +def new_lister(url='https://git.kernel.org/', + url_prefix=None, + instance='kernel', **kw): + return CGitLister(url=url, instance=instance, url_prefix=url_prefix, + **kw) + + +@app.task(name=__name__ + '.CGitListerTask') +def cgit_lister(**lister_args): + lister = new_lister(**lister_args) + lister.run() + + +@app.task(name=__name__ + '.ping') +def ping(): + return 'OK' diff --git a/swh/lister/cgit/tests/__init__.py b/swh/lister/cgit/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py new file mode 100644 index 0000000..507fef9 --- /dev/null +++ b/swh/lister/cgit/tests/conftest.py @@ -0,0 +1 @@ +from swh.lister.core.tests.conftest import * # noqa diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
new file mode 100644
index 0000000..049893e
--- /dev/null
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from urllib.parse import urlparse
+
+from swh.lister.cgit.lister import find_netloc, get_repo_list
+
+
+def test_get_repo_list():
+    """get_repo_list output matches the repo_list.txt fixture, in order."""
+    # context managers close both fixture files (the handles used to leak)
+    with open('swh/lister/cgit/tests/response.html') as f:
+        repos = get_repo_list(f.read())
+    with open('swh/lister/cgit/tests/repo_list.txt') as f:
+        expected_repos = [repo[:-1] for repo in f.readlines()]
+    assert len(repos) == len(expected_repos)
+    for repo, expected_repo in zip(repos, expected_repos):
+        assert str(repo) == expected_repo
+
+
+def test_find_netloc():
+    """find_netloc returns scheme://host for the two sample urls."""
+    first_url = urlparse('http://git.savannah.gnu.org/cgit/')
+    second_url = urlparse('https://cgit.kde.org/')
+
+    assert find_netloc(first_url) == 'http://git.savannah.gnu.org'
+    assert find_netloc(second_url) == 'https://cgit.kde.org'
diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py
new file mode 100644
index 0000000..4a36a05
--- /dev/null
+++ b/swh/lister/cgit/tests/test_tasks.py
@@ -0,0 +1,53 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    res = swh_app.send_task(
+        'swh.lister.cgit.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.cgit.tasks.CGitLister') +def test_lister_no_url_prefix(lister, swh_app, celery_session_worker): + # setup the mocked CGitLister + lister.return_value = lister + lister.run.return_value = None + + res = swh_app.send_task( + 'swh.lister.cgit.tasks.CGitListerTask', + kwargs=dict(url='https://git.kernel.org/', instance='kernel')) + assert res + res.wait() + assert res.successful() + + lister.assert_called_once_with( + url='https://git.kernel.org/', + url_prefix=None, + instance='kernel') +
lister.db_last_index.assert_not_called() + lister.run.assert_called_once_with() + + +@patch('swh.lister.cgit.tasks.CGitLister') +def test_lister_with_url_prefix(lister, swh_app, celery_session_worker): + # setup the mocked CGitLister + lister.return_value = lister + lister.run.return_value = None + + res = swh_app.send_task( + 'swh.lister.cgit.tasks.CGitListerTask', + kwargs=dict(url='https://cgit.kde.org/', + url_prefix='https://anongit.kde.org/', instance='kde')) + assert res + res.wait() + assert res.successful() + + lister.assert_called_once_with( + url='https://cgit.kde.org/', + url_prefix='https://anongit.kde.org/', + instance='kde') + lister.db_last_index.assert_not_called() + lister.run.assert_called_once_with() diff --git a/swh/lister/cli.py b/swh/lister/cli.py index 6bf6801..3a6f38f 100644 --- a/swh/lister/cli.py +++ b/swh/lister/cli.py @@ -1,150 +1,158 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import click from swh.core.cli import CONTEXT_SETTINGS logger = logging.getLogger(__name__) SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi', - 'npm', 'phabricator', 'gnu', 'cran'] + 'npm', 'phabricator', 'gnu', 'cran', 'cgit'] @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @click.pass_context def lister(ctx): '''Software Heritage Lister tools.''' pass @lister.command(name='db-init', context_settings=CONTEXT_SETTINGS) @click.option( '--db-url', '-d', default='postgres:///lister-gitlab.com', help='SQLAlchemy DB URL; see ' '') # noqa @click.argument('listers', required=1, nargs=-1, type=click.Choice(SUPPORTED_LISTERS + ['all'])) @click.option('--drop-tables', '-D', is_flag=True, default=False, help='Drop tables before creating the database schema') @click.pass_context def cli(ctx, db_url, listers, drop_tables): 
"""Initialize the database model for given listers. """ override_conf = { 'lister': { 'cls': 'local', 'args': {'db': db_url} } } if 'all' in listers: listers = SUPPORTED_LISTERS for lister in listers: logger.info('Initializing lister %s', lister) insert_minimum_data = None if lister == 'github': from .github.models import IndexingModelBase as ModelBase from .github.lister import GitHubLister _lister = GitHubLister( api_baseurl='https://api.github.com', override_config=override_conf) elif lister == 'bitbucket': from .bitbucket.models import IndexingModelBase as ModelBase from .bitbucket.lister import BitBucketLister _lister = BitBucketLister( api_baseurl='https://api.bitbucket.org/2.0', override_config=override_conf) elif lister == 'gitlab': from .gitlab.models import ModelBase from .gitlab.lister import GitLabLister _lister = GitLabLister( api_baseurl='https://gitlab.com/api/v4/', override_config=override_conf) elif lister == 'debian': from .debian.lister import DebianLister ModelBase = DebianLister.MODEL # noqa _lister = DebianLister(override_config=override_conf) def insert_minimum_data(lister): from swh.storage.schemata.distribution import ( Distribution, Area) d = Distribution( name='Debian', type='deb', mirror_uri='http://deb.debian.org/debian/') lister.db_session.add(d) areas = [] for distribution_name in ['stretch']: for area_name in ['main', 'contrib', 'non-free']: areas.append(Area( name='%s/%s' % (distribution_name, area_name), distribution=d, )) lister.db_session.add_all(areas) lister.db_session.commit() elif lister == 'pypi': from .pypi.models import ModelBase from .pypi.lister import PyPILister _lister = PyPILister(override_config=override_conf) elif lister == 'npm': from .npm.models import IndexingModelBase as ModelBase from .npm.models import NpmVisitModel from .npm.lister import NpmLister _lister = NpmLister(override_config=override_conf) if drop_tables: NpmVisitModel.metadata.drop_all(_lister.db_engine) 
NpmVisitModel.metadata.create_all(_lister.db_engine) elif lister == 'phabricator': from .phabricator.models import IndexingModelBase as ModelBase from .phabricator.lister import PhabricatorLister _lister = PhabricatorLister( forge_url='https://forge.softwareheritage.org', api_token='', override_config=override_conf) elif lister == 'gnu': from .gnu.models import ModelBase from .gnu.lister import GNULister _lister = GNULister(override_config=override_conf) elif lister == 'cran': from .cran.models import ModelBase from .cran.lister import CRANLister _lister = CRANLister(override_config=override_conf) + elif lister == 'cgit': + from .cgit.models import ModelBase + from .cgit.lister import CGitLister + _lister = CGitLister( + url='http://git.savannah.gnu.org/cgit/', + url_prefix='http://git.savannah.gnu.org/git/', + override_config=override_conf) + else: raise ValueError( 'Invalid lister %s: only supported listers are %s' % (lister, SUPPORTED_LISTERS)) if drop_tables: logger.info('Dropping tables for %s', lister) ModelBase.metadata.drop_all(_lister.db_engine) logger.info('Creating tables for %s', lister) ModelBase.metadata.create_all(_lister.db_engine) if insert_minimum_data: logger.info('Inserting minimal data for %s', lister) try: insert_minimum_data(_lister) except Exception: logger.warning( 'Failed to insert minimum data in %s', lister) if __name__ == '__main__': cli() diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py index 32b95d4..72f5b97 100644 --- a/swh/lister/core/simple_lister.py +++ b/swh/lister/core/simple_lister.py @@ -1,74 +1,74 @@ # Copyright (C) 2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from swh.core import utils from .lister_base import ListerBase class SimpleLister(ListerBase): """Lister* intermediate class for any service that follows the simple, 'list in oneshot information' pattern. 
- Client sends a request to list repositories in oneshot - Client receives structured (json/xml/etc) response with information and stores those in db """ def list_packages(self, *args): """Listing packages method. """ pass def ingest_data(self, identifier, checks=False): """Rework the base ingest_data. Request server endpoint which gives all in one go. Simplify and filter response list of repositories. Inject repo information into local db. Queue loader tasks for linked repositories. Args: identifier: Resource identifier (unused) checks (bool): Additional checks required (unused) """ response = self.safely_issue_request(identifier) response = self.list_packages(response) if not response: return response, [] models_list = self.transport_response_simplified(response) models_list = self.filter_before_inject(models_list) all_injected = [] - for models in utils.grouper(models_list, n=10000): + for models in utils.grouper(models_list, n=1000): models = list(models) logging.debug('models: %s' % len(models)) # inject into local db injected = self.inject_repo_data_into_db(models) # queue workers self.schedule_missing_tasks(models, injected) all_injected.append(injected) # flush self.db_session.commit() self.db_session = self.mk_session() return response, all_injected def run(self): """Query the server which answers in one query. Stores the information, dropping actual redundant information we already have. 
Returns: nothing """ dump_not_used_identifier = 0 response, injected_repos = self.ingest_data(dump_not_used_identifier) if not response and not injected_repos: logging.info('No response from api server, stopping') diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py index e241d27..b8dd868 100644 --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -1,17 +1,18 @@ import pytest from swh.scheduler.tests.conftest import * # noqa @pytest.fixture(scope='session') def celery_includes(): return [ 'swh.lister.bitbucket.tasks', + 'swh.lister.cgit.tasks', 'swh.lister.cran.tasks', 'swh.lister.debian.tasks', 'swh.lister.github.tasks', 'swh.lister.gitlab.tasks', 'swh.lister.gnu.tasks', 'swh.lister.npm.tasks', 'swh.lister.pypi.tasks', 'swh.lister.phabricator.tasks', ] diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index b25ab3b..55428b5 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,117 +1,121 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import subprocess import json import logging import pkg_resources +from collections import defaultdict from swh.lister.cran.models import CRANModel from swh.scheduler.utils import create_task_dict from swh.core import utils from swh.lister.core.simple_lister import SimpleLister class CRANLister(SimpleLister): MODEL = CRANModel LISTER_NAME = 'cran' instance = 'cran' + descriptions = defaultdict(dict) def task_dict(self, origin_type, origin_url, **kwargs): """Return task format dict This is overridden from the lister_base as more information is needed for the ingestion task creation. 
""" return create_task_dict( 'load-%s' % origin_type, 'recurring', - kwargs.get('name'), origin_url, kwargs.get('version')) + kwargs.get('name'), origin_url, kwargs.get('version'), + project_metadata=self.descriptions[kwargs.get('name')]) def r_script_request(self): """Runs R script which uses inbuilt API to return a json response containing data about all the R packages Returns: List of dictionaries example [ {'Package': 'A3', 'Version': '1.0.0', 'Title': 'Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModels', 'Description': 'Supplies tools for tabulating and analyzing the results of predictive models. The methods employed are ... ' } {'Package': 'abbyyR', 'Version': '0.5.4', 'Title': 'Access to Abbyy Optical Character Recognition (OCR) API', 'Description': 'Get text from images of text using Abbyy Cloud Optical Character\n ...' } ... ] """ file_path = pkg_resources.resource_filename('swh.lister.cran', 'list_all_packages.R') response = subprocess.run(file_path, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout) def get_model_from_repo(self, repo): """Transform from repository representation to model """ + self.descriptions[repo["Package"]] = repo['Description'] project_url = 'https://cran.r-project.org/src/contrib' \ '/%(Package)s_%(Version)s.tar.gz' % repo return { 'uid': repo["Package"], 'name': repo["Package"], 'full_name': repo["Title"], 'version': repo["Version"], 'html_url': project_url, 'origin_url': project_url, 'origin_type': 'cran', } def transport_response_simplified(self, response): """Transform response to list for model manipulation """ return [self.get_model_from_repo(repo) for repo in response] def ingest_data(self, identifier, checks=False): """Rework the base ingest_data. Request server endpoint which gives all in one go. Simplify and filter response list of repositories. Inject repo information into local db. Queue loader tasks for linked repositories. 
Args: identifier: Resource identifier (unused) checks (bool): Additional checks required (unused) """ response = self.r_script_request() if not response: return response, [] models_list = self.transport_response_simplified(response) models_list = self.filter_before_inject(models_list) all_injected = [] for models in utils.grouper(models_list, n=10000): models = list(models) logging.debug('models: %s' % len(models)) # inject into local db injected = self.inject_repo_data_into_db(models) # queue workers self.create_missing_origins_and_tasks(models, injected) all_injected.append(injected) # flush self.db_session.commit() self.db_session = self.mk_session() return response, all_injected diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py new file mode 100644 index 0000000..31552e1 --- /dev/null +++ b/swh/lister/cran/tests/test_lister.py @@ -0,0 +1,13 @@ +from unittest.mock import patch +from swh.lister.cran.lister import CRANLister + + +def test_task_dict(): + lister = CRANLister() + lister.descriptions['test_pack'] = 'Test Description' + with patch('swh.lister.cran.lister.create_task_dict') as mock_create_tasks: + lister.task_dict(origin_type='cran', origin_url='https://abc', + name='test_pack') + mock_create_tasks.assert_called_once_with( + 'load-cran', 'recurring', 'test_pack', 'https://abc', None, + project_metadata='Test Description') diff --git a/version.txt b/version.txt index 1054e71..d2879e5 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.30-0-g52b1de8 \ No newline at end of file +v0.0.31-0-g32c5cf2 \ No newline at end of file