diff --git a/PKG-INFO b/PKG-INFO index edc8cf0..5148449 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,127 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.7.0 +Version: 2.7.1 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index edc8cf0..5148449 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,127 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.7.0 +Version: 2.7.1 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index 5b60da7..8078175 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -1,202 +1,208 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import logging from typing import Any, Dict, Iterator, Optional, Tuple import iso8601 from launchpadlib.launchpad import Launchpad from lazr.restfulclient.errors import RestfulError from lazr.restfulclient.resource import Collection +from tenacity.before_sleep import before_sleep_log from swh.lister.utils import retry_if_exception, throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) VcsType = str LaunchpadPageType = Tuple[VcsType, Collection] SUPPORTED_VCS_TYPES = ("git", "bzr") @dataclass class LaunchpadListerState: """State of Launchpad lister""" git_date_last_modified: Optional[datetime] = None """modification date of last updated git repository since last listing""" bzr_date_last_modified: Optional[datetime] = None """modification date of last updated bzr repository since last listing""" def origin(vcs_type: str, repo: Any) -> str: """Determine the origin url out of a repository with a given vcs_type""" return repo.git_https_url if vcs_type == "git" else repo.web_link def retry_if_restful_error(retry_state): return retry_if_exception(retry_state, lambda e: isinstance(e, RestfulError)) class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): """ List repositories from Launchpad (git or bzr). Args: scheduler: instance of SchedulerInterface incremental: defines if incremental listing should be used, in that case only modified or new repositories since last incremental listing operation will be returned """ LISTER_NAME = "launchpad" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, url="https://launchpad.net/", instance="launchpad", credentials=credentials, ) self.incremental = incremental self.date_last_modified: Dict[str, Optional[datetime]] = { "git": None, "bzr": None, } def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState: for vcs_type in SUPPORTED_VCS_TYPES: key = f"{vcs_type}_date_last_modified" date_last_modified = d.get(key) if date_last_modified is not None: d[key] = iso8601.parse_date(date_last_modified) return LaunchpadListerState(**d) def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {} for vcs_type in SUPPORTED_VCS_TYPES: attribute_name = f"{vcs_type}_date_last_modified" d[attribute_name] = None if hasattr(state, attribute_name): date_last_modified = getattr(state, attribute_name) if date_last_modified is not None: d[attribute_name] = date_last_modified.isoformat() return d - @throttling_retry(retry=retry_if_restful_error) + @throttling_retry( + retry=retry_if_restful_error, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) def _page_request( self, launchpad, vcs_type: str, date_last_modified: Optional[datetime] ) -> Optional[Collection]: """Querying the page of results for a given vcs_type since the date_last_modified. If some issues occurs, this will deal with the retrying policy. """ get_vcs_fns = { "git": launchpad.git_repositories.getRepositories, "bzr": launchpad.branches.getBranches, } return get_vcs_fns[vcs_type]( order_by="most neglected first", modified_since_date=date_last_modified, ) def get_pages(self) -> Iterator[LaunchpadPageType]: """ Yields an iterator on all git/bzr repositories hosted on Launchpad sorted by last modification date in ascending order. """ launchpad = Launchpad.login_anonymously( "softwareheritage", "production", version="devel" ) if self.incremental: self.date_last_modified = { "git": self.state.git_date_last_modified, "bzr": self.state.bzr_date_last_modified, } for vcs_type in SUPPORTED_VCS_TYPES: try: result = self._page_request( launchpad, vcs_type, self.date_last_modified[vcs_type] ) except RestfulError as e: logger.warning("Listing %s origins raised %s", vcs_type, e) result = None if not result: continue yield vcs_type, result - @throttling_retry(retry=retry_if_restful_error) def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: """ Iterate on all git repositories and yield ListedOrigin instances. """ assert self.lister_obj.id is not None vcs_type, repos = page - for repo in repos: - origin_url = origin(vcs_type, repo) + try: + for repo in repos: + origin_url = origin(vcs_type, repo) - # filter out origins with invalid URL - if not origin_url.startswith("https://"): - continue + # filter out origins with invalid URL + if not origin_url.startswith("https://"): + continue - last_update = repo.date_last_modified + last_update = repo.date_last_modified - self.date_last_modified[vcs_type] = last_update + self.date_last_modified[vcs_type] = last_update - logger.debug( - "Found origin %s with type %s last updated on %s", - origin_url, - vcs_type, - last_update, - ) + logger.debug( + "Found origin %s with type %s last updated on %s", + origin_url, + vcs_type, + last_update, + ) - yield ListedOrigin( - lister_id=self.lister_obj.id, - visit_type=vcs_type, - url=origin_url, - last_update=last_update, - ) + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=vcs_type, + url=origin_url, + last_update=last_update, + ) + except RestfulError as e: + logger.warning("Listing %s origins raised %s", vcs_type, e) def finalize(self) -> None: git_date_last_modified = self.date_last_modified["git"] bzr_date_last_modified = self.date_last_modified["bzr"] if git_date_last_modified is None and bzr_date_last_modified is None: return if self.incremental and ( self.state.git_date_last_modified is None or ( git_date_last_modified is not None and git_date_last_modified > self.state.git_date_last_modified ) ): self.state.git_date_last_modified = git_date_last_modified if self.incremental and ( self.state.bzr_date_last_modified is None or ( bzr_date_last_modified is not None and bzr_date_last_modified > self.state.bzr_date_last_modified ) ): self.state.bzr_date_last_modified = self.date_last_modified["bzr"] self.updated = True