diff --git a/PKG-INFO b/PKG-INFO index 8be68c1..8d4b074 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,275 +1,275 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.6.1 +Version: 0.7.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. 
Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... 
``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) 
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a GNU lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` ## lister-cgit Once configured, you can execute a cgit lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cgit.tasks import cgit_lister logging.basicConfig(level=logging.DEBUG) # simple cgit instance cgit_lister(url='https://git.kernel.org/') # cgit instance whose listed repositories differ from the base url cgit_lister(url='https://cgit.kde.org/', url_prefix='https://anongit.kde.org/') ``` ## lister-packagist Once configured, you can execute a Packagist lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.packagist.tasks import packagist_lister logging.basicConfig(level=logging.DEBUG) packagist_lister() ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 8be68c1..8d4b074 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,275 +1,275 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.6.1 +Version: 0.7.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. 
Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... 
``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) 
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a GNU lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` ## lister-cgit Once configured, you can execute a cgit lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cgit.tasks import cgit_lister logging.basicConfig(level=logging.DEBUG) # simple cgit instance cgit_lister(url='https://git.kernel.org/') # cgit instance whose listed repositories differ from the base url cgit_lister(url='https://cgit.kde.org/', url_prefix='https://anongit.kde.org/') ``` ## lister-packagist Once configured, you can execute a Packagist lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.packagist.tasks import packagist_lister logging.basicConfig(level=logging.DEBUG) packagist_lister() ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index dc907b8..9032074 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,198 +1,197 @@ .gitignore .pre-commit-config.yaml ACKNOWLEDGEMENTS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/run_a_new_lister.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/cli.py swh/lister/pattern.py swh/lister/py.typed swh/lister/pytest_plugin.py swh/lister/utils.py swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/test_lister.py swh/lister/bitbucket/tests/test_tasks.py swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json swh/lister/bitbucket/tests/data/bb_api_repositories_page2.json swh/lister/cgit/__init__.py swh/lister/cgit/lister.py swh/lister/cgit/tasks.py swh/lister/cgit/tests/__init__.py 
swh/lister/cgit/tests/repo_list.txt swh/lister/cgit/tests/test_lister.py swh/lister/cgit/tests/test_tasks.py +swh/lister/cgit/tests/data/https_git.baserock.org/cgit +swh/lister/cgit/tests/data/https_git.eclipse.org/c swh/lister/cgit/tests/data/https_git.savannah.gnu.org/README swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit_elisp-es.git swh/lister/cgit/tests/data/https_git.tizen/README swh/lister/cgit/tests/data/https_git.tizen/cgit swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=100 swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=50 swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Users swh/lister/cgit/tests/data/https_git.tizen/cgit_Lock-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-base swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-mc1n2 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e3250 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e4x12 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_devices_nfc-plugin-nxp swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_intel_mfld_bootstub-mfld-blackbay swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_mtdev swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_opengl-es-virtual-drv swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libdrm swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libnl swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_xorg_driver_xserver-xorg-misc swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-gallery-efl swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-homescreen-efl +swh/lister/cgit/tests/data/https_jff.email/cgit swh/lister/core/__init__.py 
swh/lister/core/abstractattribute.py swh/lister/core/lister_base.py swh/lister/core/lister_transports.py swh/lister/core/models.py swh/lister/core/simple_lister.py swh/lister/core/tests/__init__.py swh/lister/core/tests/test_abstractattribute.py swh/lister/core/tests/test_lister.py swh/lister/core/tests/test_model.py swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/cran/tests/data/list-r-packages.json swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/test_lister.py swh/lister/debian/tests/test_tasks.py swh/lister/debian/tests/data/Sources_bullseye swh/lister/debian/tests/data/Sources_buster swh/lister/debian/tests/data/Sources_stretch swh/lister/gitea/__init__.py swh/lister/gitea/lister.py swh/lister/gitea/tasks.py swh/lister/gitea/tests/__init__.py swh/lister/gitea/tests/test_lister.py swh/lister/gitea/tests/test_tasks.py swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/tasks.py swh/lister/github/tests/__init__.py swh/lister/github/tests/test_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/test_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json swh/lister/gitlab/tests/data/https_gitlab.com/api_response_page1.json swh/lister/gnu/__init__.py swh/lister/gnu/lister.py -swh/lister/gnu/models.py 
swh/lister/gnu/tasks.py swh/lister/gnu/tree.py swh/lister/gnu/tests/__init__.py -swh/lister/gnu/tests/api_response.json -swh/lister/gnu/tests/conftest.py -swh/lister/gnu/tests/find_tarballs_output.json swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/gnu/tests/test_tree.py swh/lister/gnu/tests/data/tree.json swh/lister/gnu/tests/data/tree.min.json swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz swh/lister/launchpad/__init__.py swh/lister/launchpad/lister.py swh/lister/launchpad/tasks.py swh/lister/launchpad/tests/__init__.py swh/lister/launchpad/tests/conftest.py swh/lister/launchpad/tests/test_lister.py swh/lister/launchpad/tests/test_tasks.py swh/lister/launchpad/tests/data/launchpad_response1.json swh/lister/launchpad/tests/data/launchpad_response2.json swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/tasks.py swh/lister/npm/tests/test_lister.py swh/lister/npm/tests/test_tasks.py swh/lister/npm/tests/data/npm_full_page1.json swh/lister/npm/tests/data/npm_full_page2.json swh/lister/npm/tests/data/npm_incremental_page1.json swh/lister/npm/tests/data/npm_incremental_page2.json swh/lister/packagist/__init__.py swh/lister/packagist/lister.py swh/lister/packagist/models.py swh/lister/packagist/tasks.py swh/lister/packagist/tests/__init__.py swh/lister/packagist/tests/conftest.py swh/lister/packagist/tests/test_lister.py swh/lister/packagist/tests/test_tasks.py swh/lister/packagist/tests/data/https_packagist.org/packages_list.json swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/phabricator/tests/data/__init__.py swh/lister/phabricator/tests/data/phabricator_api_repositories_page1.json swh/lister/phabricator/tests/data/phabricator_api_repositories_page2.json swh/lister/pypi/__init__.py swh/lister/pypi/lister.py 
swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/test_lister.py swh/lister/pypi/tests/test_tasks.py swh/lister/pypi/tests/data/https_pypi.org/simple swh/lister/tests/__init__.py swh/lister/tests/test_cli.py swh/lister/tests/test_pattern.py swh/lister/tests/test_utils.py \ No newline at end of file diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 0253a32..646efa5 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,187 +1,216 @@ # Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup import requests from requests.exceptions import HTTPError from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) Repositories = List[Dict[str, Any]] class CGitLister(StatelessLister[Repositories]): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. - For each found git repository, a query is made at the given url found - in this index to gather published "Clone" URLs to be used as origin - URL for that git repo. + The lister currently defines 2 listing behaviors: + + - If the `base_git_url` is provided, the listed origin urls are computed out of the + base git url link and the one listed in the main listed page (resulting in less + HTTP queries than the 2nd behavior below). This is expected to be the main + deployed behavior. 
+ + - Otherwise (with no `base_git_url`), for each found git repository listed, one + extra HTTP query is made at the given url found in the main listing page to gather + published "Clone" URLs to be used as origin URL for that git repo. If several + "Clone" urls are provided, prefer the http/https one, if any, otherwise fallback + to the first one. - If several "Clone" urls are provided, prefer the http/https one, if - any, otherwise fallback to the first one. """ LISTER_NAME = "cgit" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + base_git_url: Optional[str] = None, ): """Lister class for CGit repositories. Args: - url (str): main URL of the CGit instance, i.e. url of the index + url: main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. - instance (str): Name of cgit instance. Defaults to url's hostname + instance: Name of cgit instance. Defaults to url's hostname if unset. + base_git_url: Optional base git url which allows the origin url + computations. 
""" if not instance: instance = urlparse(url).hostname assert instance is not None # Make mypy happy super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, ) self.session = requests.Session() self.session.headers.update( {"Accept": "application/html", "User-Agent": USER_AGENT} ) + self.base_git_url = base_git_url def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.session.get(url) response.raise_for_status() return BeautifulSoup(response.text, features="html.parser") def get_pages(self) -> Iterator[Repositories]: """Generate git 'project' URLs found on the current CGit server The last_update date is retrieved on the list of repo page to avoid to compute it on the repository details which only give a date per branch """ next_page: Optional[str] = self.url while next_page: bs_idx = self._get_and_parse(next_page) page_results = [] for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): - url = urljoin(self.url, tr.find("a")["href"]) + repository_link = tr.find("a")["href"] + repo_url = None + git_url = None + + base_url = urljoin(self.url, repository_link) + if self.base_git_url: # mapping provided + # computing git url + git_url = base_url.replace(self.url, self.base_git_url) + else: + # we compute the git detailed page url from which we will retrieve + # the git url (cf. 
self.get_origins_from_page) + repo_url = base_url + span = tr.find("span", {"class": re.compile("age-")}) if span: last_updated_date = span["title"] else: last_updated_date = None page_results.append( - {"url": url, "last_updated_date": last_updated_date} + { + "url": repo_url, + "git_url": git_url, + "last_updated_date": last_updated_date, + } ) yield page_results try: pager = bs_idx.find("ul", {"class": "pager"}) current_page = pager.find("a", {"class": "current"}) if current_page: next_page = current_page.parent.next_sibling.a["href"] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page next_page = None def get_origins_from_page( self, repositories: Repositories ) -> Iterator[ListedOrigin]: """Convert a page of cgit repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None - for repository in repositories: - origin_url = self._get_origin_from_repository_url(repository["url"]) + for repo in repositories: + origin_url = repo["git_url"] or self._get_origin_from_repository_url( + repo["url"] + ) if origin_url is None: continue yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="git", - last_update=_parse_last_updated_date(repository), + last_update=_parse_last_updated_date(repo), ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: """Extract the git url from the repository page""" try: bs = self._get_and_parse(repository_url) except HTTPError as e: logger.warning( "Unexpected HTTP status code %s on %s", e.response.status_code, e.response.url, ) return None # origin urls are listed on the repository page # TODO check if forcing https is better or not ? 
# # # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: return None # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] return origin_url def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: """Parse the last updated date""" date = repository.get("last_updated_date") if not date: return None parsed_date = None for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"): try: parsed_date = datetime.strptime(date, date_format) # force UTC to avoid naive datetime if not parsed_date.tzinfo: parsed_date = parsed_date.replace(tzinfo=timezone.utc) break except Exception: pass if not parsed_date: logger.warning( "Could not parse %s last_updated date: %s", repository["url"], date, ) return parsed_date diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py index e6ef159..9a95648 100644 --- a/swh/lister/cgit/tasks.py +++ b/swh/lister/cgit/tasks.py @@ -1,21 +1,25 @@ # Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, Optional from celery import shared_task from .lister import CGitLister @shared_task(name=__name__ + ".CGitListerTask") -def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]: +def list_cgit( + url: str, instance: Optional[str] = None, base_git_url: Optional[str] = None +) -> Dict[str, str]: """Lister task for CGit instances""" - lister = CGitLister.from_configfile(url=url, instance=instance) + lister = CGitLister.from_configfile( + url=url, instance=instance, base_git_url=base_git_url + ) return lister.run().dict() @shared_task(name=__name__ + ".ping") def _ping(): return "OK" diff --git a/swh/lister/cgit/tests/data/https_git.baserock.org/cgit 
b/swh/lister/cgit/tests/data/https_git.baserock.org/cgit new file mode 100644 index 0000000..46fde7e --- /dev/null +++ b/swh/lister/cgit/tests/data/https_git.baserock.org/cgit @@ -0,0 +1,33 @@ + + + + Lorry Depot + + + + + + + + + diff --git a/swh/lister/cgit/tests/data/https_git.eclipse.org/c b/swh/lister/cgit/tests/data/https_git.eclipse.org/c new file mode 100644 index 0000000..8cd1d4b --- /dev/null +++ b/swh/lister/cgit/tests/data/https_git.eclipse.org/c @@ -0,0 +1,42 @@ + + + + Eclipse Git repositories + + + + + + + + + +
+
+ +
+

+ Back to the top +

+ + diff --git a/swh/lister/cgit/tests/data/https_jff.email/cgit b/swh/lister/cgit/tests/data/https_jff.email/cgit new file mode 100644 index 0000000..ebc0289 --- /dev/null +++ b/swh/lister/cgit/tests/data/https_jff.email/cgit @@ -0,0 +1,33 @@ + + + + Jörgs Debian Repository + + + + + + + + + diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py index cf4c9fc..f963a4e 100644 --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -1,198 +1,233 @@ # Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from typing import List import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.lister import __version__ from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date from swh.lister.pattern import ListerStats def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler): url = "https://git.savannah.gnu.org/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) repos: List[List[str]] = list(lister_cgit.get_pages()) flattened_repos = sum(repos, []) assert len(flattened_repos) == 977 assert ( flattened_repos[0]["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" ) # note the url below is NOT a subpath of /cgit/ assert ( flattened_repos[-1]["url"] == "https://git.savannah.gnu.org/path/to/yetris.git/" ) # noqa # note the url below is NOT on the same server assert flattened_repos[-2]["url"] == "http://example.org/cgit/xstarcastle.git/" def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler): url = "https://git.tizen/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) repos: List[List[str]] = list(lister_cgit.get_pages()) flattened_repos = sum(repos, []) # we should have 16 repos (listed on 3 pages) assert len(repos) == 3 assert 
len(flattened_repos) == 16 def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler): """cgit lister supports pagination""" url = "https://git.tizen/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) stats = lister_cgit.run() expected_nb_origins = 16 assert stats == ListerStats(pages=3, origins=expected_nb_origins) # test page parsing scheduler_origins = swh_scheduler.get_listed_origins( lister_cgit.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins # test listed repositories for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith("https://git.tizen") # test user agent content assert len(requests_mock_datadir.request_history) != 0 for request in requests_mock_datadir.request_history: assert "User-Agent" in request.headers user_agent = request.headers["User-Agent"] assert "Software Heritage Lister" in user_agent assert __version__ in user_agent def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler): """cgit lister returns last updated date""" url = "https://git.tizen/cgit" urls_without_date = [ f"https://git.tizen.org/cgit/{suffix_url}" for suffix_url in ["All-Projects", "All-Users", "Lock-Projects",] ] lister_cgit = CGitLister(swh_scheduler, url=url) stats = lister_cgit.run() expected_nb_origins = 16 assert stats == ListerStats(pages=3, origins=expected_nb_origins) # test page parsing scheduler_origins = swh_scheduler.get_listed_origins( lister_cgit.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins # test listed repositories for listed_origin in scheduler_origins: if listed_origin.url in urls_without_date: assert listed_origin.last_update is None else: assert listed_origin.last_update is not None @pytest.mark.parametrize( "date_str,expected_date", [ ({}, None), ("unexpected date", None), ("2020-0140-10 10:10:10 (GMT)", None), ( "2020-01-10 10:10:10 (GMT)", datetime( year=2020, month=1, day=10, hour=10, 
minute=10, second=10, tzinfo=timezone.utc, ), ), ( "2019-08-04 05:10:41 +0100", datetime( year=2019, month=8, day=4, hour=5, minute=10, second=41, tzinfo=timezone(timedelta(hours=1)), ), ), ], ) def test_lister_cgit_date_parsing(date_str, expected_date): """test cgit lister date parsing""" repository = {"url": "url", "last_updated_date": date_str} assert _parse_last_updated_date(repository) == expected_date requests_mock_datadir_missing_url = requests_mock_datadir_factory( ignore_urls=["https://git.tizen/cgit/adaptation/ap_samsung/audio-hal-e4x12/",] ) def test_lister_cgit_get_origin_from_repo_failing( requests_mock_datadir_missing_url, swh_scheduler ): url = "https://git.tizen/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) stats = lister_cgit.run() expected_nb_origins = 15 assert stats == ListerStats(pages=3, origins=expected_nb_origins) @pytest.mark.parametrize( "credentials, expected_credentials", [ (None, []), ({"key": "value"}, []), ( {"cgit": {"tizen": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], ) def test_lister_cgit_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): url = "https://git.tizen/cgit/" lister = CGitLister( swh_scheduler, url=url, instance="tizen", credentials=credentials ) # Credentials are allowed in constructor assert lister.credentials == expected_credentials def test_lister_cgit_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "url": "https://git.tizen/cgit/", "instance": "tizen", "credentials": {}, } lister = CGitLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None + + +@pytest.mark.parametrize( + "url,base_git_url,expected_nb_origins", + [ + ("https://git.eclipse.org/c", "https://eclipse.org/r", 5), + ("https://git.baserock.org/cgit/", 
"https://git.baserock.org/git/", 3), + ("https://jff.email/cgit/", "git://jff.email/opt/git/", 6), + ], +) +def test_lister_cgit_with_base_git_url( + url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler +): + """With base git url provided, listed urls should be the computed origin urls + + """ + lister_cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url,) + + stats = lister_cgit.run() + + assert stats == ListerStats(pages=1, origins=expected_nb_origins) + + # test page parsing + scheduler_origins = swh_scheduler.get_listed_origins( + lister_cgit.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + # test listed repositories + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith(base_git_url) + assert ( + listed_origin.url.startswith(url) is False + ), f"url should be mapped to {base_git_url}" diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py index f348221..b9a00cd 100644 --- a/swh/lister/cgit/tests/test_tasks.py +++ b/swh/lister/cgit/tests/test_tasks.py @@ -1,35 +1,35 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.pattern import ListerStats def test_cgit_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.cgit.tasks.ping") assert res res.wait() assert res.successful() assert res.result == "OK" def test_cgit_lister_task( swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker ): # setup the mocked CGitLister lister = mocker.patch("swh.lister.cgit.tasks.CGitLister") lister.from_configfile.return_value = lister lister.run.return_value = ListerStats(pages=10, origins=500) - kwargs = 
dict(url="https://git.kernel.org/", instance="kernel") + kwargs = dict(url="https://git.kernel.org/", instance="kernel", base_git_url=None) res = swh_scheduler_celery_app.send_task( "swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs, ) assert res res.wait() assert res.successful() lister.from_configfile.assert_called_once_with(**kwargs) lister.run.assert_called_once_with() diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py index eec08d4..e750c90 100644 --- a/swh/lister/gnu/__init__.py +++ b/swh/lister/gnu/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import GNULister - from .models import GNUModel return { - "models": [GNUModel], "lister": GNULister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index e6582fb..7d976ee 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,112 +1,68 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -from typing import Any, Dict, List +from typing import Any, Iterator, Mapping -from requests import Response +import iso8601 -from swh.lister.core.simple_lister import SimpleLister -from swh.lister.gnu.models import GNUModel -from swh.lister.gnu.tree import GNUTree -from swh.scheduler import utils +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin -logger = logging.getLogger(__name__) - - -class GNULister(SimpleLister): - MODEL = GNUModel - LISTER_NAME = "gnu" - instance = 
"gnu" +from ..pattern import CredentialsType, StatelessLister +from .tree import GNUTree - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz") +logger = logging.getLogger(__name__) - def task_dict(self, origin_type, origin_url, **kwargs): - """Return task format dict +GNUPageType = Mapping[str, Any] - This is overridden from the lister_base as more information is - needed for the ingestion task creation. - This creates tasks with args and kwargs set, for example: +class GNULister(StatelessLister[GNUPageType]): + """ + List all GNU projects and associated artifacts. + """ - .. code-block:: python + LISTER_NAME = "GNU" + GNU_FTP_URL = "https://ftp.gnu.org" - args: - kwargs: { - 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'artifacts': [{ - 'url': 'https://...', - 'time': '2003-12-09T21:43:20+00:00', - 'length': 128, - 'version': '1.0.1', - 'filename': 'something-1.0.1.tar.gz', - }, - ... - ] - } + def __init__( + self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url=self.GNU_FTP_URL, + instance="GNU", + credentials=credentials, + ) + self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") + def get_pages(self) -> Iterator[GNUPageType]: """ - artifacts = self.gnu_tree.artifacts[origin_url] - assert origin_type == "tar" - return utils.create_task_dict( - "load-archive-files", - kwargs.get("policy", "oneshot"), - url=origin_url, - artifacts=artifacts, - retries_left=3, - ) + Yield a single page listing all GNU projects. + """ + yield self.gnu_tree.projects - def safely_issue_request(self, identifier: int) -> None: - """Bypass the implementation. It's now the GNUTree which deals with - querying the gnu mirror. + def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: + """ + Iterate on all GNU projects and yield ListedOrigin instances. 
+ """ + assert self.lister_obj.id is not None - As an implementation detail, we cannot change simply the base - SimpleLister as other implementation still uses it. This shall be part - of another refactoring pass. + artifacts = self.gnu_tree.artifacts - """ - return None - - def list_packages(self, response: Response) -> List[Dict[str, Any]]: - """List the actual gnu origins (package name) with their name, url and - associated tarballs. - - Args: - response: Unused - - Returns: - List of packages name, url, last modification time:: - - [ - { - 'name': '3dldf', - 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'time_modified': '2003-12-09T20:43:20+00:00' - }, - { - 'name': '8sync', - 'url': 'https://ftp.gnu.org/gnu/8sync/', - 'time_modified': '2016-12-06T02:37:10+00:00' - }, - ... - ] + for project_name, project_info in page.items(): - """ - return list(self.gnu_tree.projects.values()) + origin_url = project_info["url"] + last_update = iso8601.parse_date(project_info["time_modified"]) - def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: - """Transform from repository representation to model + logger.debug("Found origin %s last updated on %s", origin_url, last_update) - """ - return { - "uid": repo["url"], - "name": repo["name"], - "full_name": repo["name"], - "html_url": repo["url"], - "origin_url": repo["url"], - "time_last_updated": repo["time_modified"], - "origin_type": "tar", - } + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="tar", + last_update=last_update, + extra_loader_arguments={"artifacts": artifacts[project_name]}, + ) diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py deleted file mode 100644 index db024f7..0000000 --- a/swh/lister/gnu/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, 
DateTime, String - -from ..core.models import ModelBase - - -class GNUModel(ModelBase): - """a GNU repository representation - - """ - - __tablename__ = "gnu_repo" - - uid = Column(String, primary_key=True) - time_last_updated = Column(DateTime) diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py index 3134582..8c90b36 100644 --- a/swh/lister/gnu/tasks.py +++ b/swh/lister/gnu/tasks.py @@ -1,18 +1,18 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task from .lister import GNULister @shared_task(name=__name__ + ".GNUListerTask") def list_gnu_full(**lister_args): """List lister for the GNU source code archive""" - return GNULister(**lister_args).run() + return GNULister.from_configfile(**lister_args).run().dict() @shared_task(name=__name__ + ".ping") def _ping(): return "OK" diff --git a/swh/lister/gnu/tests/api_response.json b/swh/lister/gnu/tests/api_response.json deleted file mode 100644 index 55e8204..0000000 --- a/swh/lister/gnu/tests/api_response.json +++ /dev/null @@ -1,37 +0,0 @@ -[{"type":"directory","name": ".","contents":[ - {"type":"file","name":".footer.shtml","size":444,"time":"1359994299"}, - {"type":"file","name":"find.txt.gz","size":261428,"time":"1557684608"}, - {"type":"directory","name":"gnu","size":12288,"time":"1556742017","contents":[]}, - {"type":"directory","name":"gnu+linux-distros","size":4096,"time":"1299783002","contents":[ - {"type":"directory","name":"ututo-e","size":4096,"time":"1487780066","contents":[ - {"type":"file","name":"README","size":48,"time":"1487780066"}, - {"type":"file","name":"index.html","size":158,"time":"1487780054"} - ]} - ]}, - {"type":"file","name":"ls-lrRt.txt.gz","size":480054,"time":"1557684607"}, - {"type":"directory","name":"mirrors","size":4096,"time":"1114010630","contents":[ - 
{"type":"directory","name":"dynebolic","size":4096,"time":"1317827602","contents":[ - {"type":"file","name":"MOVED_TO_mirror.fsf.org_dynebolic","size":0,"time":"1317826935"}, - {"type":"file","name":"index.html","size":107,"time":"1317827601"} - ]} - ]}, - {"type":"link","name":"non-gnu","target":"gnu/non-gnu","size":11,"time":"1082055542","contents":[]}, - {"type":"directory","name":"old-gnu","size":4096,"time":"1548360019","contents":[]}, - {"type":"link","name":"pub","target":".","size":1,"time":"1060090003","contents":[]}, - {"type":"directory","name":"savannah","size":4096,"time":"1194544006","contents":[ - {"type":"file","name":"README","size":473,"time":"1143758028"} - ]}, - {"type":"directory","name":"third-party","size":4096,"time":"1059825710","contents":[ - {"type":"file","name":"README","size":374,"time":"983824071"} - ]}, - {"type":"directory","name":"tmp","size":4096,"time":"1239072509","contents":[ - ]}, - {"type":"file","name":"tree.json.gz","size":0,"time":"1557684608"}, - {"type":"directory","name":"video","size":4096,"time":"1367963189","contents":[ - {"type":"file","name":".bash_history","size":27,"time":"1307027604"}, - {"type":"file","name":"stallmanupv.ogg.sig","size":536,"time":"1299776853"} - ]}, - {"type":"file","name":"welcome.msg","size":2830,"time":"1545163301"} -]}, -{"type":"report","directories":2743,"files":63983} -] diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py deleted file mode 100644 index 23cd215..0000000 --- a/swh/lister/gnu/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "gnu" - - -@pytest.fixture -def lister_gnu(swh_lister): - for task_type in [ - { - "type": 
"load-archive-files", - "description": "Load archive repository", - "backend_name": "swh.loader.packages.tasks.LoadArchive", - "default_interval": "1 day", - }, - ]: - swh_lister.scheduler.create_task_type(task_type) - - return swh_lister diff --git a/swh/lister/gnu/tests/find_tarballs_output.json b/swh/lister/gnu/tests/find_tarballs_output.json deleted file mode 100644 index f1ccd6c..0000000 --- a/swh/lister/gnu/tests/find_tarballs_output.json +++ /dev/null @@ -1,182 +0,0 @@ -[ - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.bz2", - "date": "1495205979" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.gz", - "date": "1495205967" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.12-f39e-dirty.tar.gz", - "date": "1494994222" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.bz2", - "date": "1520284021" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.gz", - "date": "1520284007" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.bz2", - "date": "1521742071" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.gz", - "date": "1521742057" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.bz2", - "date": "1525717261" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.gz", - "date": "1525717246" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.bz2", - "date": "1546205569" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.gz", - "date": "1546205555" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.bz2", - "date": "1546205025" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.gz", - "date": "1546205012" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_0-src.zip", - "date": "898422900" - }, - { - "archive": 
"https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_2-src.zip", - "date": "920018269" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_3-src.zip", - "date": "936750503" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_4-src.tar.gz", - "date": "944290190" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_5-src.tar.gz", - "date": "944600462" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_6-src.tar.gz", - "date": "952156231" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_7-src.tar.gz", - "date": "952313061" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_1_0-src.tar.gz", - "date": "969299378" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_0beta-src.tar.gz", - "date": "977027031" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_1-src.tar.gz", - "date": "981323331" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_2-src.tar.gz", - "date": "981570576" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_3-src.tar.gz", - "date": "982656672" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_4-src.tar.gz", - "date": "1007952574" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_5-src.tar.gz", - "date": "1008502483" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_6-src.tar.gz", - "date": "1012641285" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-3.6.2.tar.gz", - "date": "869814000" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.0.tar.gz", - "date": "898422900" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.2.tar.gz", - "date": "920018202" - }, - { - "archive": 
"https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.3.tar.gz", - "date": "936750512" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.4.tar.gz", - "date": "944290148" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.5.tar.gz", - "date": "944599461" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.6.tar.gz", - "date": "952156235" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.7.tar.gz", - "date": "952313085" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.1.0.tar.gz", - "date": "969299287" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.0beta.tar.gz", - "date": "977027108" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.1.tar.gz", - "date": "981323501" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.2.tar.gz", - "date": "981562809" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.3.tar.gz", - "date": "982657006" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.4.tar.gz", - "date": "1007952745" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.5.tar.gz", - "date": "1008466945" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.6.tar.gz", - "date": "1012641715" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.7.tar.gz", - "date": "1070057764" - } - ] \ No newline at end of file diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py index e17365a..8cbb3b1 100644 --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -1,47 +1,36 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging 
+from ..lister import GNULister -logger = logging.getLogger(__name__) +def test_gnu_lister(swh_scheduler, requests_mock_datadir): + lister = GNULister(scheduler=swh_scheduler) -def test_gnu_lister(lister_gnu, requests_mock_datadir): - lister_gnu.run() + stats = lister.run() - r = lister_gnu.scheduler.search_tasks(task_type="load-archive-files") - assert len(r) == 383 + assert stats.pages == 1 + assert stats.origins == 383 - for row in r: - assert row["type"] == "load-archive-files" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - # kwargs - kwargs = row["arguments"]["kwargs"] - assert set(kwargs.keys()) == {"url", "artifacts"} + assert len(scheduler_origins) == stats.origins - url = kwargs["url"] - assert url.startswith("https://ftp.gnu.org") + for origin in scheduler_origins: + assert origin.url.startswith(GNULister.GNU_FTP_URL) + assert origin.last_update is not None + assert "artifacts" in origin.extra_loader_arguments + assert len(origin.extra_loader_arguments["artifacts"]) > 0 - url_suffix = url.split("https://ftp.gnu.org")[1] - assert "gnu" in url_suffix or "old-gnu" in url_suffix - artifacts = kwargs["artifacts"] - # check the artifact's structure - artifact = artifacts[0] - assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"} - - for artifact in artifacts: - logger.debug(artifact) - # 'time' is an isoformat string now - for key in ["url", "time", "filename", "version"]: - assert isinstance(artifact[key], str) - assert isinstance(artifact["length"], int) - - assert row["policy"] == "oneshot" - assert row["priority"] is None - assert row["retries_left"] == 3 +def test_gnu_lister_from_configfile(swh_scheduler_config, mocker): + load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") + load_from_envvar.return_value = { + "scheduler": {"cls": "local", **swh_scheduler_config}, + "credentials": {}, + } + 
lister = GNULister.from_configfile() + assert lister.scheduler is not None + assert lister.credentials is not None diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py index d91f86a..464d622 100644 --- a/swh/lister/gnu/tests/test_tasks.py +++ b/swh/lister/gnu/tests/test_tasks.py @@ -1,30 +1,30 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +from swh.lister.pattern import ListerStats def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.gnu.tasks.ping") assert res res.wait() assert res.successful() assert res.result == "OK" -@patch("swh.lister.gnu.tasks.GNULister") -def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked GNULister - lister.return_value = lister - lister.run.return_value = None +def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + lister = mocker.patch("swh.lister.gnu.tasks.GNULister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=300) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task("swh.lister.gnu.tasks.GNUListerTask") assert res res.wait() assert res.successful() + assert res.result == stats.dict() - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with() lister.run.assert_called_once_with() diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 4f541fb..1a6b3e3 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -1,277 +1,277 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level 
directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar from swh.core.config import load_from_envvar from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface @dataclass class ListerStats: pages: int = 0 origins: int = 0 def __add__(self, other: "ListerStats") -> "ListerStats": return self.__class__(self.pages + other.pages, self.origins + other.origins) def __iadd__(self, other: "ListerStats"): self.pages += other.pages self.origins += other.origins def dict(self) -> Dict[str, int]: return {"pages": self.pages, "origins": self.origins} StateType = TypeVar("StateType") PageType = TypeVar("PageType") BackendStateType = Dict[str, Any] CredentialsType = Optional[Dict[str, Dict[str, List[Dict[str, str]]]]] class Lister(Generic[StateType, PageType]): """The base class for a Software Heritage lister. A lister scrapes a page by page list of origins from an upstream (a forge, the API of a package manager, ...), and massages the results of that scrape into a list of origins that are recorded by the scheduler backend. The main loop of the lister, :meth:`run`, basically revolves around the :meth:`get_pages` iterator, which sets up the lister state, then yields the scrape results page by page. The :meth:`get_origins_from_page` method converts the pages into a list of :class:`model.ListedOrigin`, sent to the scheduler at every page. The :meth:`commit_page` method can be used to update the lister state after a page of origins has been recorded in the scheduler backend. The :func:`finalize` method is called at lister teardown (whether the run has been successful or not) to update the local :attr:`state` object before it's sent to the database. 
This method must set the :attr:`updated` attribute if an updated state needs to be sent to the scheduler backend. This method can call :func:`get_state_from_scheduler` to refresh and merge the lister state from the scheduler before it's finalized (and potentially minimize the risk of race conditions between concurrent runs of the lister). The state of the lister is serialized and deserialized from the dict stored in the scheduler backend, using the :meth:`state_from_dict` and :meth:`state_to_dict` methods. Args: scheduler: the instance of the Scheduler being used to register the origins listed by this lister url: a URL representing this lister, e.g. the API's base URL instance: the instance name used, in conjunction with :attr:`LISTER_NAME`, to uniquely identify this lister instance. credentials: dictionary of credentials for all listers. The first level identifies the :attr:`LISTER_NAME`, the second level the lister :attr:`instance`. The final level is a list of dicts containing the expected credentials for the given instance of that lister. Generic types: - *StateType*: concrete lister type; should usually be a :class:`dataclass` for stricter typing - *PageType*: type of scrape results; can usually be a :class:`requests.Response`, or a :class:`dict` """ LISTER_NAME: str = "" def __init__( self, scheduler: SchedulerInterface, url: str, instance: str, credentials: CredentialsType = None, ): if not self.LISTER_NAME: raise ValueError("Must set the LISTER_NAME attribute on Lister classes") self.url = url self.instance = instance self.scheduler = scheduler if not credentials: credentials = {} self.credentials = list( credentials.get(self.LISTER_NAME, {}).get(self.instance, []) ) # store the initial state of the lister self.state = self.get_state_from_scheduler() self.updated = False def run(self) -> ListerStats: """Run the lister. Returns: A counter with the number of pages and origins seen for this run of the lister. 
""" full_stats = ListerStats() try: for page in self.get_pages(): full_stats.pages += 1 origins = self.get_origins_from_page(page) full_stats.origins += self.send_origins(origins) self.commit_page(page) finally: self.finalize() if self.updated: self.set_state_in_scheduler() return full_stats def get_state_from_scheduler(self) -> StateType: """Update the state in the current instance from the state in the scheduler backend. This updates :attr:`lister_obj`, and returns its (deserialized) current state, to allow for comparison with the local state. Returns: the state retrieved from the scheduler backend """ self.lister_obj = self.scheduler.get_or_create_lister( name=self.LISTER_NAME, instance_name=self.instance ) return self.state_from_dict(self.lister_obj.current_state) def set_state_in_scheduler(self) -> None: """Update the state in the scheduler backend from the state of the current instance. Raises: :class:`swh.scheduler.exc.StaleData` in case of a race condition between concurrent listers (from :meth:`swh.scheduler.Scheduler.update_lister`). """ self.lister_obj.current_state = self.state_to_dict(self.state) self.lister_obj = self.scheduler.update_lister(self.lister_obj) # State management to/from the scheduler def state_from_dict(self, d: BackendStateType) -> StateType: """Convert the state stored in the scheduler backend (as a dict), to the concrete StateType for this lister.""" raise NotImplementedError def state_to_dict(self, state: StateType) -> BackendStateType: """Convert the StateType for this lister to its serialization as dict for storage in the scheduler. Values must be JSON-compatible as that's what the backend database expects. """ raise NotImplementedError def finalize(self) -> None: """Custom hook to finalize the lister state before returning from the main loop. This method must set :attr:`updated` if the lister has done some work. 
If relevant, this method can use :meth`get_state_from_scheduler` to merge the current lister state with the one from the scheduler backend, reducing the risk of race conditions if we're running concurrent listings. This method is called in a `finally` block, which means it will also run when the lister fails. """ pass # Actual listing logic def get_pages(self) -> Iterator[PageType]: """Retrieve a list of pages of listed results. This is the main loop of the lister. Returns: an iterator of raw pages fetched from the platform currently being listed. """ raise NotImplementedError def get_origins_from_page(self, page: PageType) -> Iterator[model.ListedOrigin]: """Extract a list of :class:`model.ListedOrigin` from a raw page of results. Args: page: a single page of results Returns: an iterator for the origins present on the given page of results """ raise NotImplementedError def commit_page(self, page: PageType) -> None: """Custom hook called after the current page has been committed in the scheduler backend. This method can be used to update the state after a page of origins has been successfully recorded in the scheduler backend. If the new state should be recorded at the point the lister completes, the :attr:`updated` attribute must be set. """ pass def send_origins(self, origins: Iterable[model.ListedOrigin]) -> int: """Record a list of :class:`model.ListedOrigin` in the scheduler. Returns: the number of listed origins recorded in the scheduler """ count = 0 - for batch_origins in grouper(origins, n=100): + for batch_origins in grouper(origins, n=1000): ret = self.scheduler.record_listed_origins(batch_origins) count += len(ret) return count @classmethod def from_config(cls, scheduler: Dict[str, Any], **config: Any): """Instantiate a lister from a configuration dict. This is basically a backwards-compatibility shim for the CLI. 
Args: scheduler: instantiation config for the scheduler config: the configuration dict for the lister, with the following keys: - credentials (optional): credentials list for the scheduler - any other kwargs passed to the lister. Returns: the instantiated lister """ # Drop the legacy config keys which aren't used for this generation of listers. for legacy_key in ("storage", "lister", "celery"): config.pop(legacy_key, None) # Instantiate the scheduler scheduler_instance = get_scheduler(**scheduler) return cls(scheduler=scheduler_instance, **config) @classmethod def from_configfile(cls, **kwargs: Any): """Instantiate a lister from the configuration loaded from the SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their value is not None. Args: kwargs: kwargs passed to the lister instantiation """ config = dict(load_from_envvar()) config.update({k: v for k, v in kwargs.items() if v is not None}) return cls.from_config(**config) class StatelessLister(Lister[None, PageType], Generic[PageType]): def state_from_dict(self, d: BackendStateType) -> None: """Always return empty state""" return None def state_to_dict(self, state: None) -> BackendStateType: """Always set empty state""" return {}