diff --git a/PKG-INFO b/PKG-INFO index 6687f51..4c7c008 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,127 +1,123 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.2 +Version: 2.9.3 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. 
Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. - - diff --git a/conftest.py b/conftest.py index da8b930..00eb31a 100644 --- a/conftest.py +++ b/conftest.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -pytest_plugins = ["swh.scheduler.pytest_plugin"] +pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"] os.environ["LC_ALL"] = "C.UTF-8" diff --git a/debian/changelog b/debian/changelog index 29e4622..37d0cb1 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,1187 +1,1192 @@ -swh-lister (2.9.2-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-lister (2.9.3-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 2.9.3 - (tagged by Antoine R. Dumont + (@ardumont) on 2022-05-23 15:39:15 + +0200) + * Upstream changes: - v2.9.3 - Adapt maven lister to list + canonical gh urls if any - Use swh.core.github.pytest_plugin in + github tests - -- Software Heritage autobuilder (on jenkins-debian1) Tue, 10 May 2022 08:31:14 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 May 2022 13:47:34 +0000 swh-lister (2.9.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.9.2 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-05-10 10:22:12 +0200) * Upstream changes: - v2.9.2 - maven: Prevent UnicodeDecodeError when processing pom file -- Software Heritage autobuilder (on jenkins-debian1) Tue, 10 May 2022 08:27:22 +0000 swh-lister (2.9.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.9.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-29 14:45:18 +0200) * Upstream changes: - v2.9.1 - crates: Create one origin per package instead of per version - maven: Handle null mtime value in index for jar archive - maven: Remove extraction of groupId and artifactId from pom files - maven: Create one origin per package instead of one per package version - Bump mypy to v0.942 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Apr 2022 12:50:29 +0000 swh-lister (2.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.9.0 - (tagged by Valentin Lorentz on 2022-04-26 11:28:55 +0200) * Upstream changes: - v2.9.0 - * github: Remove dead code - * github: Refactor rate-limiting out of the GitHubLister class - * maven: Remove duplicated code related to setting instance from netloc -- Software Heritage autobuilder (on jenkins-debian1) Tue, 26 Apr 2022 09:34:54 +0000 swh-lister (2.8.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.8.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-25 12:34:14 +0200) * Upstream changes: - v2.8.2 - sourceforge: Fix listing of bzr projects - sourceforge: Do not consider Attic as a valid CVS module -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Apr 2022 10:39:18 +0000 swh-lister (2.8.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.8.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-04-14 15:56:17 +0200) * Upstream changes: - v2.8.1 - maven: Fix argument of type 'NoneType' is not iterable -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Apr 2022 14:01:42 +0000 swh-lister (2.8.0-1~swh2) unstable-swh; urgency=medium * Bump new release (fix build dep) -- Antoine R. Dumont (@ardumont) Thu, 14 Apr 2022 14:51:05 +0200 swh-lister (2.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-14 11:42:16 +0200) * Upstream changes: - v2.8.0 - lister: Add new rust crates lister - maven: Continue listing if unable to retrieve pom information - maven: log error message when not able to retrieve the index to read -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Apr 2022 09:50:25 +0000 swh-lister (2.7.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.7.2 - (tagged by Antoine Lambert on 2022-03-11 13:34:15 +0100) * Upstream changes: - version 2.7.2 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Mar 2022 12:38:38 +0000 swh-lister (2.7.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.7.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-18 10:42:52 +0100) * Upstream changes: - v2.7.1 - launchpad: Ignore erratic page and continue listing next page -- Software Heritage autobuilder (on jenkins-debian1) Fri, 18 Feb 2022 09:46:37 +0000 swh-lister (2.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-17 13:56:23 +0100) * Upstream changes: - v2.7.0 - launchpad: Allow bzr origins listing - launchpad: Manage unhandled exceptions when listing - sourceforge: Fix origin URLs for CVS projects -- Software Heritage autobuilder (on jenkins-debian1) Thu, 17 Feb 2022 13:02:22 +0000 swh-lister (2.6.4-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-02-14 16:57:38 +0100) * Upstream changes: - v2.6.4 - sourceforge: fix support for listing bzr origins -- Software Heritage autobuilder (on jenkins-debian1) Mon, 14 Feb 2022 16:01:23 +0000 swh-lister (2.6.3-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.3 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-09 17:20:28 +0100) * Upstream changes: - v2.6.3 - maven: Fix last update datetime -- Software Heritage autobuilder (on jenkins-debian1) Wed, 09 Feb 2022 16:24:11 +0000 swh-lister (2.6.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-08 10:39:05 +0100) * Upstream changes: - v2.6.2 - Remove no longer needed tenacity workarounds - maven: Fix undef last_update in ListedOrigins. - maven: dismiss origins if they are malformed - e.g. wrong pom scm format, add test. - maven: Let logging instruction do the formatting - maven: Add more debug logging instruction - maven: Pass the base URL of the Maven instance to the loader - docs: Fix ReST syntax and sphinx warnings - Pin mypy and drop type annotations which makes mypy unhappy - requirements-test: Pin pytest to < 7.0.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Feb 2022 09:43:37 +0000 swh-lister (2.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.1 - (tagged by Antoine Lambert on 2021-12-06 10:47:19 +0100) * Upstream changes: - version 2.6.1 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Dec 2021 09:51:07 +0000 swh-lister (2.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.0 - (tagged by Antoine Lambert on 2021-12-03 16:17:52 +0100) * Upstream changes: - version 2.6.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 15:22:00 +0000 swh-lister (2.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.0 - (tagged by Antoine Lambert on 2021-12-03 14:44:36 +0100) * Upstream changes: - version 2.5.0 -- 
Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 13:48:49 +0000 swh-lister (2.4.0-1~swh3) unstable-swh; urgency=medium * Fix changelog error and actual correct release -- Antoine R. Dumont (@ardumont) Fri, 03 Dec 2021 12:45:00 +0100 swh.lister (2.4.0-1~swh2) unstable-swh; urgency=medium * Update missing deps and release -- Antoine R. Dumont (@ardumont) Fri, 03 Dec 2021 12:37:13 +0100 swh-lister (2.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-03 12:17:36 +0100) * Upstream changes: - v2.4.0 - debian: Update extra_loader_arguments dict produced ListedOrigin models - debian: Add missing file URIs in lister output - Deduplicate origins in the GitHub lister - lister: Add new maven lister -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 11:21:58 +0000 swh-lister (2.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.3.0 - (tagged by Valentin Lorentz on 2021-11-10 13:44:49 +0100) * Upstream changes: - v2.3.0 - * cran: Pass the package name to the loader -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Nov 2021 13:03:02 +0000 swh-lister (2.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.0 - (tagged by Antoine Lambert on 2021-10-22 15:16:48 +0200) * Upstream changes: - version 2.2.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Oct 2021 13:23:02 +0000 swh-lister (2.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.1.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-10-13 10:16:37 +0200) * Upstream changes: - v2.1.0 - Let sourceforge origins be listed "enabled" by default - docs: Add a save forge documentation - docs: Explain task type registering to complete the save forge doc -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Oct 2021 08:21:42 +0000 swh-lister (2.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-09-29 09:21:37 +0200) * Upstream changes: - v2.0.0 - opam: Share opam root directory even on multiple instances -- Software Heritage autobuilder (on jenkins-debian1) Wed, 29 Sep 2021 07:31:03 +0000 swh-lister (1.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-21 11:23:23 +0200) * Upstream changes: - v1.9.0 - gnu: Respect the pattern docstring about state initialization - opam: Allow defining where to actually install the opam_root folder - opam: Make the instance optional and derived from the url - opam: Move the state initialization into the get_pages method -- Software Heritage autobuilder (on jenkins-debian1) Tue, 21 Sep 2021 09:29:04 +0000 swh-lister (1.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-17 15:44:00 +0200) * Upstream changes: - v1.8.0 - Allow gitlab lister's name to be overridden by task arguments -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 13:47:58 +0000 swh-lister (1.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-17 13:37:22 +0200) * Upstream changes: - v1.7.0 - gitlab: Allow ingestion of hg_git origins as hg ones (some instance can list tose e.g - foss.heptapod.net) -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 11:41:52 +0000 swh-lister (1.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-17 10:50:28 +0200) * Upstream changes: - v1.6.0 - gitlab: Allow listing of instances providing multiple vcs_type -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 08:55:14 +0000 swh-lister (1.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.5.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-07-23 16:28:50 +0200) * Upstream changes: - v1.5.0 - gitlab: Handle HTTP status code 500 when listing projects - gitlab: Update requests query parameters - gitlab: Adapt requests retry policy to consider HTTP 50x status codes - opam: Directly use the --root flag instead of using an env variable - pattern: Use URL network location as instance name when not provided -- Software Heritage autobuilder (on jenkins-debian1) Fri, 23 Jul 2021 14:32:51 +0000 swh-lister (1.4.0-1~swh2) unstable-swh; urgency=medium * Bump new release -- Antoine R. Dumont (@ardumont) Fri, 09 Jul 2021 13:17:00 +0200 swh-lister (1.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-07-09 13:01:04 +0200) * Upstream changes: - v1.4.0 - New Tuleap lister - New Opam lister - Make PyPI lister incremental - Make PyPI lister complete the information on origins -- Software Heritage autobuilder (on jenkins-debian1) Fri, 09 Jul 2021 11:06:37 +0000 swh-lister (1.3.6-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.6 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-04 11:59:24 +0200) * Upstream changes: - v1.3.6 - sourceforge: use http:// for Mercurial (as workaround) -- Software Heritage autobuilder (on jenkins-debian1) Fri, 04 Jun 2021 10:03:14 +0000 swh-lister (1.3.5-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.5 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-03 10:22:17 +0200) * Upstream changes: - v1.3.5 - sourceforge: set the protocol for origin urls -- Software Heritage autobuilder (on jenkins-debian1) Thu, 03 Jun 2021 08:26:13 +0000 swh-lister (1.3.4-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-05-31 16:54:37 +0200) * Upstream changes: - v1.3.4 - Disable the sourceforge lister origins (so they can be listed) -- Software Heritage autobuilder (on jenkins-debian1) Mon, 31 May 2021 15:08:17 +0000 swh-lister (1.3.3-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.3 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-28 14:18:53 +0200) * Upstream changes: - v1.3.3 - cgit/lister: Fix error when a missing version is not provided -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 May 2021 12:39:52 +0000 swh-lister (1.3.2-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.2 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-26 12:43:45 +0200) * Upstream changes: - v1.3.2 - sourceforge: retry for all retryable exceptions -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 May 2021 10:48:22 +0000 swh-lister (1.3.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-19 11:25:59 +0200) * Upstream changes: - v1.3.1 - sourceforge: don't abort on error for project -- Software Heritage autobuilder (on jenkins-debian1) Wed, 19 May 2021 09:30:14 +0000 swh-lister (1.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-05-07 17:17:50 +0200) * Upstream changes: - v1.3.0 - sourceforge/tasks: Allow incremental listing - sourceforge/lister: Add credentials parameter -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 15:24:27 +0000 swh-lister (1.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.2 - (tagged by Antoine Lambert on 2021-05-07 14:43:24 +0200) * Upstream changes: - version 1.2.2 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 12:50:12 +0000 swh-lister (1.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.1 - (tagged by Antoine Lambert on 2021-05-07 14:10:36 +0200) * Upstream changes: - version 1.2.1 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 12:17:16 +0000 swh-lister (1.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-06 15:17:51 +0200) * Upstream changes: - v1.2.0 - Make the SourceForge lister incremental -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 10:43:11 +0000 swh-lister (1.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.1.0 - (tagged by Antoine Lambert on 2021-04-29 14:29:27 +0200) * Upstream changes: - version 1.1.0 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 29 Apr 2021 12:33:59 +0000 swh-lister (1.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.0 - (tagged by Nicolas Dandrimont on 2021-03-22 10:56:04 +0100) * Upstream changes: - Release swh.lister v1.0.0 - All listers have been rewritten and are ready to be used in production - with the most recent version of the swh.scheduler APIs. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Mar 2021 10:13:35 +0000 swh-lister (0.10.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.10.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-03-01 09:59:16 +0100) * Upstream changes: - v0.10.0 - docs: Add new "howto write a lister tutorial" with unified lister api -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Mar 2021 09:01:54 +0000 swh-lister (0.9.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-08 14:09:27 +0100) * Upstream changes: - v0.9.1 - debian: Update archive mirror URL templates to process -- Software Heritage autobuilder (on jenkins-debian1) Mon, 08 Feb 2021 13:12:05 +0000 swh-lister (0.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-08 08:50:07 +0100) * Upstream changes: - v0.9.0 - docs: Update listers execution instructions - cran: Prevent multiple listing of an origin - cran: Add support for parsing date with milliseconds - pypi: Use BeautifulSoup for parsing HTML instead of xmltodict -- Software Heritage autobuilder (on jenkins-debian1) Mon, 08 Feb 2021 07:52:57 +0000 swh-lister (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-03 11:12:52 +0100) * Upstream changes: - v0.8.0 - packagist: Reimplement lister using new Lister API - gnu: Remove dependency on pytz - Remove no longer used models field in dict returned by register - Remove no longer used legacy Lister API and update CLI options -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Feb 2021 10:15:54 +0000 swh-lister (0.7.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.1 - (tagged by Vincent SELLIER on 2021-02-01 17:52:33 +0100) * Upstream changes: - v0.7.1 - * cgit: remove the repository urls's trailing / -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Feb 2021 16:56:35 +0000 swh-lister (0.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-02-01 09:31:30 +0100) * Upstream changes: - v0.7.0 - pattern: Bump packet split to chunk of 1000 records - cgit: Compute origin urls out of a base git url when provided. - gnu: Reimplement lister using new Lister API -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Feb 2021 08:35:14 +0000 swh-lister (0.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-29 09:07:21 +0100) * Upstream changes: - v0.6.1 - launchpad: Remove call to dataclasses.asdict on lister state - launchpad: Prevent error due to origin listed twice - Make debian lister constructors compatible with credentials - launchpad/tasks: Fix ping task function name - pattern: Make lister flush regularly origins to scheduler -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Jan 2021 08:11:13 +0000 swh-lister (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-28 15:48:32 +0100) * Upstream changes: - v0.6.0 - launchpad: Reimplement lister using new Lister API - Make stateless lister constructors compatible with credentials -- Software Heritage autobuilder (on jenkins-debian1) Thu, 28 Jan 2021 14:52:49 +0000 swh-lister (0.5.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.4 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-28 11:23:29 +0100) * Upstream changes: - v0.5.4 - gitlab: Deal with missing or trailing / in url input - tox.ini: Work around build failure due to upstream release -- Software Heritage autobuilder (on jenkins-debian1) Thu, 28 Jan 2021 10:27:59 +0000 swh-lister (0.5.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.2 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-01-27 17:19:10 +0100) * Upstream changes: - v0.5.2 - test_cli: Drop launchpad lister from the test_get_lister -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Jan 2021 16:25:31 +0000 swh-lister (0.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-27 16:39:20 +0100) * Upstream changes: - v0.5.1 - launchpad: Actually mock the anonymous login to launchpad - Drop no longer swh.lister.core.{indexing,page_by_page}_lister - tests: Drop unneeded reset instruction - cgit: Don't stop the listing when a repository page is not available -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Jan 2021 15:47:39 +0000 swh-lister (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-27 14:33:24 +0100) * Upstream changes: - v0.5.0 - cgit: Add support for last_update information during listing - Port Debian lister to new lister api - gitlab: Implement keyset-based pagination listing - cran: Retrieve last update date for each listed package - Port CRAN lister to new lister api - gitlab: Add support for last_update information during listing - Port Gitea lister to new lister api - Port cgit lister to the new lister api - bitbucket: Pick random credentials in configuration and improve logging - Port Gitlab lister to the new lister api - Port Npm lister to new lister api - Port PyPI lister to new lister api - Port Bitbucket lister to new lister api - Port Phabricator lister to new lister api - Port GitHub lister to new lister api - Introduce a simpler base pattern for lister implementations -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Jan 2021 13:40:34 +0000 swh-lister (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-11-23 15:47:05 +0100) * Upstream changes: - v0.4.0 - requirements: Rework dependencies - tests: Reduce db initialization fixtures to a minimum - Create listing task with a default of 3 if unspecified - lister.pytest_plugin: Simplify fixture setup - tests: Clarify listers test configuration -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Nov 2020 14:52:03 +0000 swh-lister (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-19 09:50:43 +0200) * Upstream changes: - v0.3.0 - lister.config: Adapt scheduler configuration structure - drop mock_get_scheduler which creates indirection for no good reason -- Software Heritage autobuilder (on jenkins-debian1) Mon, 19 Oct 2020 07:56:17 +0000 swh-lister (0.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-07 14:02:42 +0200) * Upstream changes: - v0.2.1 - lister_base: Drop leftover mixin SWHConfig which is no longer used -- Software Heritage autobuilder (on jenkins-debian1) Wed, 07 Oct 2020 12:07:43 +0000 swh-lister (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-06 09:33:33 +0200) * Upstream changes: - v0.2.0 - lister*: Migrate away from SWHConfig mixin - tox.ini: pin black to the pre-commit version (19.10b0) to avoid flip-flops - Run isort after the CLI import changes -- Software Heritage autobuilder (on jenkins-debian1) Tue, 06 Oct 2020 07:36:07 +0000 swh-lister (0.1.5-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.5 - (tagged by David Douard on 2020-09-25 11:51:57 +0200) * Upstream changes: - v0.1.5 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 09:55:44 +0000 swh-lister (0.1.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-09-10 11:32:46 +0200) * Upstream changes: - v0.1.4 - gitea.lister: Fix uid to be unique across instance - utils.split_range: Split into not overlapping ranges - gitea.tasks: Fix parameter name from 'sort' to 'order' -- Software Heritage autobuilder (on jenkins-debian1) Thu, 10 Sep 2020 09:35:53 +0000 swh-lister (0.1.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.3 - (tagged by Vincent SELLIER on 2020-09-08 14:48:08 +0200) * Upstream changes: - v0.1.3 - Launchpad: rename task name to match conventions - tests: Separate lister instantiations -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Sep 2020 12:53:22 +0000 swh-lister (0.1.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-02 13:07:30 +0200) * Upstream changes: - v0.1.2 - pytest_plugin: Instantiate only lister with no particular setup - pytest: Define plugin and declare it in the root conftest -- Software Heritage autobuilder (on jenkins-debian1) Wed, 02 Sep 2020 11:10:14 +0000 swh-lister (0.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-01 16:08:48 +0200) * Upstream changes: - v0.1.1 - test_cli: Exclude launchpad lister from the check -- Software Heritage autobuilder (on jenkins-debian1) Tue, 01 Sep 2020 14:11:46 +0000 swh-lister (0.1.0-1~swh2) unstable-swh; urgency=medium * Update dependencies -- Antoine R. 
Dumont (@ardumont) Wed, 26 Aug 2020 16:05:03 +0000 swh-lister (0.1.0-1~swh1) unstable-swh; urgency=medium [ Nicolas Dandrimont ] * Use setuptools-scm instead of vcversioner [ Software Heritage autobuilder (on jenkins-debian1) ] * New upstream release 0.1.0 - (tagged by David Douard on 2020-08-25 18:33:55 +0200) * Upstream changes: - v0.1.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 25 Aug 2020 16:39:28 +0000 swh-lister (0.0.50-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.50 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-20 10:44:57 +0100) * Upstream changes: - v0.0.50 - github.lister: Filter out partial repositories which break listing - docs: Fix sphinx warnings - core.lister_base: Improve slightly docs and types -- Software Heritage autobuilder (on jenkins-debian1) Mon, 20 Jan 2020 09:51:23 +0000 swh-lister (0.0.49-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.49 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-17 14:20:35 +0100) * Upstream changes: - v0.0.49 - github.lister: Use Retry-After header when rate limit reached -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Jan 2020 13:27:56 +0000 swh-lister (0.0.48-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.48 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-16 13:56:12 +0100) * Upstream changes: - v0.0.48 - cran.lister: Use cran's canonical url for origin url - cran.lister: Version uid so we can list new package versions - cran.lister: Adapt docstring sample accordingly -- Software Heritage autobuilder (on jenkins-debian1) Thu, 16 Jan 2020 13:03:54 +0000 swh-lister (0.0.47-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.47 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-01-09 10:26:18 +0100) * Upstream changes: - v0.0.47 - cran.lister: Align loading tasks' with loader's expectation -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Jan 2020 09:34:26 +0000 swh-lister (0.0.46-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.46 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-19 14:09:45 +0100) * Upstream changes: - v0.0.46 - lister.debian: Make debian init step idempotent and up-to-date - lister_base: Split into chunks the tasks prior to creation -- Software Heritage autobuilder (on jenkins-debian1) Thu, 19 Dec 2019 13:16:45 +0000 swh-lister (0.0.45-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.45 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-10 11:27:17 +0100) * Upstream changes: - v0.0.45 - core: Align listers' task output (hg/git tasks) with expected format - npm: Align lister's loader output tasks with expected format - lister/tasks: Standardize return statements -- Software Heritage autobuilder (on jenkins-debian1) Tue, 10 Dec 2019 10:32:45 +0000 swh-lister (0.0.44-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.44 - (tagged by Nicolas Dandrimont on 2019-11-22 16:15:54 +0100) * Upstream changes: - Release swh.lister v0.0.44 - Define proper User Agents everywhere -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Nov 2019 15:31:33 +0000 swh-lister (0.0.43-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.43 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-21 18:46:35 +0100) * Upstream changes: - v0.0.43 - lister.pypi: Align lister with pypi package loader - lister.npm: Align lister with npm package loader - lister.tests: Avoid duplication setup step - Fix typos (and trailing ws) reported by codespell - Add a pre-commit config file -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 17:56:34 +0000 swh-lister (0.0.42-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.42 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 13:52:16 +0100) * Upstream changes: - v0.0.42 - cran/gnu: Rename task_type to load-archive-files - lister.tests: Add missing task_type for package listers - Migrate tox.ini to extras = xxx instead of deps = .[testing] - Merge tox environments - Include all requirements in MANIFEST.in - lister.cli: Remove task type register cli -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 13:00:29 +0000 swh-lister (0.0.41-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.41 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-15 12:02:13 +0100) * Upstream changes: - v0.0.41 - simple_lister: Flush to db more frequently - gnu.lister: Use url as primary key - gnu.lister.tests: Add missing assertion - gnu.lister: Add missing retries_left parameter - debian.models: Migrate tests from storage to debian lister model -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Nov 2019 11:06:35 +0000 swh-lister (0.0.40-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.40 - (tagged by Nicolas Dandrimont on 2019-11-13 13:54:38 +0100) * Upstream changes: - Release swh.lister 0.0.40 - Fix bogus NotImplementedError on Area.index_uris -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Nov 2019 13:02:08 +0000 swh-lister (0.0.39-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.39 - (tagged by Nicolas Dandrimont on 2019-11-13 13:23:31 +0100) * Upstream changes: - Release swh.lister 0.0.39 - Properly register all tasks - Fix up db_partition_indices to avoid expensive scans -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Nov 2019 12:28:33 +0000 swh-lister (0.0.38-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.38 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-06 15:55:46 +0100) * Upstream changes: - v0.0.38 - Remove swh.storage.schemata remnants -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Nov 2019 15:00:16 +0000 swh-lister (0.0.37-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.37 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-06 15:06:51 +0100) * Upstream changes: - v0.0.37 - Update swh-core dependency -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Nov 2019 14:18:31 +0000 swh-lister (0.0.36-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.36 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-06 11:33:33 +0100) * Upstream changes: - v0.0.36 - lister.*.tests: Add at least one integration test - gnu.lister: Move gnu listers specifity within the lister's scope - debian/lister: Use url parameter name instead of origin - debian/model: Install lister model within the lister repository - lister.*.tasks: Stop binding tasks to a specific instance of the - celery app - cran.lister: Refactor and fix cran lister - github/lister: Prevent erroneous scheduler tasks disabling - phabricator/lister: Fix lister - setup.py: Kill deprecated swh- lister command - Bootstrap typing annotations -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Nov 2019 10:55:41 +0000 swh-lister (0.0.35-1~swh4) unstable-swh; urgency=medium * Fix runtime dependencies -- Antoine R. Dumont (@ardumont) Wed, 11 Sep 2019 10:58:01 +0200 swh-lister (0.0.35-1~swh3) unstable-swh; urgency=medium * Bump dh-python to >= 3 for pybuild.testfiles. -- Nicolas Dandrimont Tue, 10 Sep 2019 14:58:11 +0200 swh-lister (0.0.35-1~swh2) unstable-swh; urgency=medium * Add egg-info to pybuild.testfiles. Close T1995. -- Nicolas Dandrimont Tue, 10 Sep 2019 14:36:22 +0200 swh-lister (0.0.35-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.35 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-09 12:14:42 +0200) * Upstream changes: - v0.0.35 - Fix debian package -- Software Heritage autobuilder (on jenkins-debian1) Mon, 09 Sep 2019 10:19:02 +0000 swh-lister (0.0.34-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.34 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-09-06 14:03:39 +0200) * Upstream changes: - v0.0.34 - listers: Implement listers as plugins - cgit: rewrite the CGit lister (and add more tests) - listers: simplify and unify constructor use - phabricator: randomly select the API token in the provided list - docs: Fix toc -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Sep 2019 12:09:13 +0000 swh-lister (0.0.33-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.33 - (tagged by Antoine R. Dumont (@ardumont) on 2019-08-29 10:23:20 +0200) * Upstream changes: - v0.0.33 - lister.cli: Allow to list forges with policy and priority - listers: Add New packagist lister - listers: Allow to override policy and priority for scheduled tasks - tests: Add tests to cli, pypi and improve lister core's - docs: Add code of conduct document -- Software Heritage autobuilder (on jenkins-debian1) Thu, 29 Aug 2019 08:28:23 +0000 swh-lister (0.0.32-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.32 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-28 18:21:50 +0200) * Upstream changes: - v0.0.32 - Clean up dead code - Add missing *.html sample for tests to run in packaging -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 Jun 2019 16:42:05 +0000 swh-lister (0.0.31-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.31 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-28 17:57:48 +0200) * Upstream changes: - v0.0.31 - Add cgit instance lister - Add back description in cran lister - Update contributors -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 Jun 2019 16:06:25 +0000 swh-lister (0.0.30-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.30 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-26 14:52:13 +0200) * Upstream changes: - v0.0.30 - Drop last description mentions for gitlab and cran listers. 
-- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 13:02:11 +0000 swh-lister (0.0.29-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.29 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-26 12:37:14 +0200) * Upstream changes: - v0.0.29 - lister: Fix bitbucket lister -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 10:47:20 +0000 swh-lister (0.0.28-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.28 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-20 12:00:09 +0200) * Upstream changes: - v0.0.28 - listers: Remove unused columns `origin_id` / `description` - gnu-lister: Use origin-type as 'tar' (and not 'gnu') - phabricator: Remove unused code -- Software Heritage autobuilder (on jenkins-debian1) Thu, 20 Jun 2019 10:07:48 +0000 swh-lister (0.0.27-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.27 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-18 10:27:09 +0200) * Upstream changes: - v0.0.27 - Unify lister tablenames to use consistently singular - Add missing instance field to phabricator repository model -- Software Heritage autobuilder (on jenkins-debian1) Tue, 18 Jun 2019 08:44:38 +0000 swh-lister (0.0.26-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.26 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-17 17:53:33 +0200) * Upstream changes: - v0.0.26 - phabricator.lister: Use credentials setup from configuration file - gitlab.lister: Remove request_params method override -- Software Heritage autobuilder (on jenkins-debian1) Mon, 17 Jun 2019 16:05:05 +0000 swh-lister (0.0.25-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.25 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-06-13 15:54:42 +0200) * Upstream changes: - v0.0.25 - Add new cran lister - listers: Stop creating origins when scheduling new tasks -- Software Heritage autobuilder (on jenkins-debian1) Thu, 13 Jun 2019 13:59:30 +0000 swh-lister (0.0.24-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.24 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-12 12:02:54 +0200) * Upstream changes: - v0.0.24 - swh.lister.gnu: Add new gnu lister -- Software Heritage autobuilder (on jenkins-debian1) Wed, 12 Jun 2019 10:10:56 +0000 swh-lister (0.0.23-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.23 - (tagged by Antoine R. Dumont (@ardumont) on 2019-05-29 14:04:22 +0200) * Upstream changes: - v0.0.23 - lister: Unify credentials structure between listers -- Software Heritage autobuilder (on jenkins-debian1) Wed, 29 May 2019 12:10:51 +0000 swh-lister (0.0.22-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.22 - (tagged by Antoine Lambert on 2019-05-23 10:59:39 +0200) * Upstream changes: - version 0.0.22 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 May 2019 09:05:34 +0000 swh-lister (0.0.21-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.21 - (tagged by Antoine Lambert on 2019-04-11 11:00:55 +0200) * Upstream changes: - version 0.0.21 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:05:30 +0000 swh-lister (0.0.20-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.20 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-02-14 10:50:06 +0100) * Upstream changes: - v0.0.20 - d/*: debian packaging files migrated to separated branches - lister.cli: Fix spelling typo -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 09:59:29 +0000 swh-lister (0.0.19-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.19 - (tagged by David Douard on 2019-02-07 17:36:33 +0100) * Upstream changes: - v0.0.19 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 07 Feb 2019 16:42:39 +0000 swh-lister (0.0.18-1~swh1) unstable-swh; urgency=medium * v0.0.18 * docs: add title and brief module description * gitlab.lister: Break asap when problem exists during fetch info * gitlab.lister: Do not expect gitlab instances to have credentials * setup: prepare for pypi upload * gitlab/models.py: drop unused import -- Antoine R. Dumont (@ardumont) Mon, 08 Oct 2018 15:54:12 +0200 swh-lister (0.0.17-1~swh1) unstable-swh; urgency=medium * v0.0.17 * Change pypi project url to use the /project api -- Antoine R. Dumont (@ardumont) Tue, 18 Sep 2018 11:35:25 +0200 swh-lister (0.0.16-1~swh1) unstable-swh; urgency=medium * v0.0.16 * Normalize PyPI name -- Antoine R. Dumont (@ardumont) Fri, 14 Sep 2018 13:25:56 +0200 swh-lister (0.0.15-1~swh1) unstable-swh; urgency=medium * v0.0.15 * Add pypi lister -- Antoine R. Dumont (@ardumont) Thu, 06 Sep 2018 17:09:25 +0200 swh-lister (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * core.lister_base: Batch create origins (storage) & tasks (scheduler) * swh.lister.cli: Add debian lister to the list of supported listers * README.md: Update to demo the lister debian run -- Antoine R. Dumont (@ardumont) Tue, 31 Jul 2018 15:46:12 +0200 swh-lister (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Fix missing use cases when unable to retrieve information from the api * server * gitlab/lister: Allow specifying the number of elements to * read (default is 20, same as the current gitlab api) -- Antoine R. 
Dumont (@ardumont) Fri, 20 Jul 2018 13:46:04 +0200 swh-lister (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * swh.lister.gitlab.tasks: Use gitlab as instance name for gitlab.com * README.md: Add gitlab to the lister implementations referenced * core/lister_base: Remove unused import -- Antoine R. Dumont (@ardumont) Thu, 19 Jul 2018 11:29:14 +0200 swh-lister (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * lister/gitlab: Add gitlab lister * docs: Update documentation to demonstrate how to run a lister locally * core/lister: Make the listers' scheduler configuration adaptable * debian/*: Fix debian packaging tests -- Antoine R. Dumont (@ardumont) Wed, 18 Jul 2018 14:16:56 +0200 swh-lister (0.0.10-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.10 * Add missing task_queue attribute for debian listing tasks * Make sure tests run during build * Clean up runtime dependencies -- Nicolas Dandrimont Mon, 30 Oct 2017 17:37:25 +0100 swh-lister (0.0.9-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.9 * Add tasks for the Debian lister -- Nicolas Dandrimont Mon, 30 Oct 2017 14:20:58 +0100 swh-lister (0.0.8-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.8 * Add versioned dependency on sqlalchemy -- Nicolas Dandrimont Fri, 13 Oct 2017 12:15:38 +0200 swh-lister (0.0.7-1~swh1) unstable-swh; urgency=medium * Release swh.lister version 0.0.7 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:52 +0200 swh-lister (0.0.6-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.6 * Add new debian lister -- Nicolas Dandrimont Wed, 11 Oct 2017 17:59:47 +0200 swh-lister (0.0.5-1~swh1) unstable-swh; urgency=medium * Release swh.lister 0.0.5 * Make the lister more generic * Add bitbucket lister * Update tasks to new swh.scheduler API -- Nicolas Dandrimont Mon, 12 Jun 2017 18:22:13 +0200 swh-lister (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * Update storage configuration reading -- Antoine R. 
Dumont (@ardumont) Thu, 15 Dec 2016 19:07:24 +0100 swh-lister (0.0.3-1~swh1) unstable-swh; urgency=medium * Release swh.lister.github v0.0.3 * Generate swh.scheduler tasks and swh.storage origins on the fly * Use celery tasks to schedule own work -- Nicolas Dandrimont Thu, 20 Oct 2016 17:30:39 +0200 swh-lister (0.0.2-1~swh1) unstable-swh; urgency=medium * Release swh.lister.github 0.0.2 * Move constants to a constants module to avoid circular imports -- Nicolas Dandrimont Thu, 17 Mar 2016 20:35:11 +0100 swh-lister (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * Release swh.lister.github v0.0.1 -- Nicolas Dandrimont Thu, 17 Mar 2016 19:01:20 +0100 diff --git a/requirements-swh.txt b/requirements-swh.txt index b451cdb..3281b3e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db] >= 0.9 +swh.core[db,github] >= 2.8 swh.scheduler >= 0.8 diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 6687f51..4c7c008 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,127 +1,123 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.2 +Version: 2.9.3 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown 
Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2.
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program.
- - diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 6caae43..5e69dc1 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,19 +1,19 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity>=6.2 xmltodict lxml -swh.core[db]>=0.9 +swh.core[db,github]>=2.8 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 2655744..acef224 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,208 +1,208 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import datetime import logging from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 +from swh.core.github.utils import GitHubSession, MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister -from .utils import GitHubSession, MissingRateLimitReset logger = logging.getLogger(__name__) @dataclass class GitHubListerState: """State of the GitHub lister""" last_seen_id: int = 0 """Numeric id of the last repository listed on an incremental pass""" class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """List origins from GitHub. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_id` stored in the scheduler backend. 
Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in that mode, the lister finds the origins present in the range **excluding** `first_id` and **including** `last_id`. In this mode, the lister can overrun the `last_id`: it will always record all the origins seen in a given page. As the lister is fully idempotent, this is not a practical problem. Once relisting completes, the lister state in the scheduler backend is not updated. When the config contains a set of credentials, we shuffle this list at the beginning of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the same token over and over again, until its rate limit runs out. Once that happens, we switch to the next token over in our shuffled list. When a request fails with a rate limit exception for all tokens, we pause the listing until the largest value for X-Ratelimit-Reset over all tokens. When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits Args: first_id: the id of the first repo to list last_id: stop listing after seeing a repo with an id higher than this value. 
""" # noqa: B950 LISTER_NAME = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, first_id: Optional[int] = None, last_id: Optional[int] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance="github", ) self.first_id = first_id self.last_id = last_id self.relisting = self.first_id is not None or self.last_id is not None self.github_session = GitHubSession( credentials=self.credentials, user_agent=USER_AGENT ) def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: current_id = 0 if self.first_id is not None: current_id = self.first_id elif self.state is not None: current_id = self.state.last_seen_id current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}" while self.last_id is None or current_id < self.last_id: logger.debug("Getting page %s", current_url) try: response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up break # We've successfully retrieved a (non-ratelimited) `response`. We # still need to check it for validity. if response.status_code != 200: logger.warning( "Got unexpected status_code %s: %s", response.status_code, response.content, ) break yield response.json() if "next" not in response.links: # No `next` link, we've reached the end of the world logger.debug( "No next link found in the response headers, all caught up" ) break # GitHub strongly advises to use the next link directly. We still # parse it to get the id of the last repository we've reached so # far. 
next_url = response.links["next"]["url"] parsed_url = urlparse(next_url) if not parsed_url.query: logger.warning("Failed to parse url %s", next_url) break parsed_query = parse_qs(parsed_url.query) current_id = int(parsed_query["since"][0]) current_url = next_url def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of GitHub repositories into a list of ListedOrigins. This records the html_url, as well as the pushed_at value if it exists. """ assert self.lister_obj.id is not None seen_in_page: Set[str] = set() for repo in page: if not repo: # null repositories in listings happen sometimes... continue if repo["html_url"] in seen_in_page: continue seen_in_page.add(repo["html_url"]) pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: pushed_at = iso8601.parse_date(pushed_at_str) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["html_url"], visit_type="git", last_update=pushed_at, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page""" if self.relisting: # Don't update internal state when relisting return if not page: # Sometimes, when you reach the end of the world, GitHub returns an empty # page of repositories return last_id = page[-1]["id"] if last_id > self.state.last_seen_id: self.state.last_seen_id = last_id def finalize(self): if self.relisting: return # Pull fresh lister state from the scheduler backend scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. 
if self.state.last_seen_id > scheduler_state.last_seen_id: self.updated = True diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py index 2c874ae..88c5bf4 100644 --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -1,418 +1,245 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging -import time -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List import pytest import requests_mock +from swh.core.github.pytest_plugin import github_response_callback from swh.lister.github.lister import GitHubLister from swh.lister.pattern import CredentialsType, ListerStats from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import Lister NUM_PAGES = 10 ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES -def github_repo(i: int) -> Dict[str, Union[int, str]]: - """Basic repository information returned by the GitHub API""" - - repo: Dict[str, Union[int, str]] = { - "id": i, - "html_url": f"https://github.com/origin/{i}", - } - - # Set the pushed_at date on one of the origins - if i == 4321: - repo["pushed_at"] = "2018-11-08T13:16:24Z" - - return repo - - -def github_response_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, -) -> List[Dict[str, Union[str, int]]]: - """Return minimal GitHub API responses for the common case where the loader - hasn't been rate-limited""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - - # Check request parameters: per_page == 1000, since = 
last_repo_id - assert "per_page" in request.qs - assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)] - assert "since" in request.qs - - since = int(request.qs["since"][0]) - - next_page = since + GitHubLister.PAGE_SIZE - if next_page < ORIGIN_COUNT: - # the first id for the next page is within our origin count; add a Link - # header to the response - next_url = ( - GitHubLister.API_URL - + f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}" - ) - context.headers["Link"] = f"<{next_url}>; rel=next" - - return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)] - - @pytest.fixture() def requests_mocker() -> Iterator[requests_mock.Mocker]: with requests_mock.Mocker() as mock: mock.get(GitHubLister.API_URL, json=github_response_callback) yield mock def get_lister_data(swh_scheduler: SchedulerInterface) -> Lister: """Retrieve the data for the GitHub Lister""" return swh_scheduler.get_or_create_lister(name="github", instance_name="github") def set_lister_state(swh_scheduler: SchedulerInterface, state: Dict[str, Any]) -> None: """Set the state of the lister in database""" lister = swh_scheduler.get_or_create_lister(name="github", instance_name="github") lister.current_state = state swh_scheduler.update_lister(lister) def check_origin_4321(swh_scheduler: SchedulerInterface, lister: Lister) -> None: """Check that origin 4321 exists and has the proper last_update timestamp""" origin_4321_req = swh_scheduler.get_listed_origins( url="https://github.com/origin/4321" ) assert len(origin_4321_req.results) == 1 origin_4321 = origin_4321_req.results[0] assert origin_4321.lister_id == lister.id assert origin_4321.visit_type == "git" assert origin_4321.last_update == datetime.datetime( 2018, 11, 8, 13, 16, 24, tzinfo=datetime.timezone.utc ) def check_origin_5555(swh_scheduler: SchedulerInterface, lister: Lister) -> None: """Check that origin 5555 exists and has no last_update timestamp""" origin_5555_req = swh_scheduler.get_listed_origins( 
url="https://github.com/origin/5555" ) assert len(origin_5555_req.results) == 1 origin_5555 = origin_5555_req.results[0] assert origin_5555.lister_id == lister.id assert origin_5555.visit_type == "git" assert origin_5555.last_update is None def test_from_empty_state( swh_scheduler, caplog, requests_mocker: requests_mock.Mocker ) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Run the lister in incremental mode lister = GitHubLister(scheduler=swh_scheduler) res = lister.run() assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) listed_origins = swh_scheduler.get_listed_origins(limit=ORIGIN_COUNT + 1) assert len(listed_origins.results) == ORIGIN_COUNT assert listed_origins.next_page_token is None lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT} check_origin_4321(swh_scheduler, lister_data) check_origin_5555(swh_scheduler, lister_data) def test_incremental(swh_scheduler, caplog, requests_mocker) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Number of origins to skip skip_origins = 2000 expected_origins = ORIGIN_COUNT - skip_origins # Bump the last_seen_id in the scheduler backend set_lister_state(swh_scheduler, {"last_seen_id": skip_origins}) # Run the lister in incremental mode lister = GitHubLister(scheduler=swh_scheduler) res = lister.run() # add 1 page to the number of full_pages if partial_page_len is not 0 full_pages, partial_page_len = divmod(expected_origins, GitHubLister.PAGE_SIZE) expected_pages = full_pages + bool(partial_page_len) assert res == ListerStats(pages=expected_pages, origins=expected_origins) listed_origins = swh_scheduler.get_listed_origins(limit=expected_origins + 1) assert len(listed_origins.results) == expected_origins assert listed_origins.next_page_token is None lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT} check_origin_4321(swh_scheduler, lister_data) 
check_origin_5555(swh_scheduler, lister_data) def test_relister(swh_scheduler, caplog, requests_mocker) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Only set this state as a canary: in the currently tested mode, the lister # should not be touching it. set_lister_state(swh_scheduler, {"last_seen_id": 123}) # Use "relisting" mode to list origins between id 10 and 1011 lister = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011) res = lister.run() # Make sure we got two full pages of results assert res == ListerStats(pages=2, origins=2000) # Check that the relisting mode hasn't touched the stored state. lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": 123} -def github_ratelimit_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, - ratelimit_reset: Optional[int], -) -> Dict[str, str]: - """Return a rate-limited GitHub API response.""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - if "Authorization" in request.headers: - context.status_code = 429 - else: - context.status_code = 403 - - if ratelimit_reset is not None: - context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset) - - return { - "message": "API rate limit exceeded for .", - "documentation_url": "https://developer.github.com/v3/#rate-limiting", - } - - -@pytest.fixture() -def num_before_ratelimit() -> int: - """Number of successful requests before the ratelimit hits""" - return 0 - - -@pytest.fixture() -def num_ratelimit() -> Optional[int]: - """Number of rate-limited requests; None means infinity""" - return None - - -@pytest.fixture() -def ratelimit_reset() -> Optional[int]: - """Value of the X-Ratelimit-Reset header on ratelimited responses""" - return None - - -@pytest.fixture() -def requests_ratelimited( - num_before_ratelimit: int, - 
num_ratelimit: Optional[int], - ratelimit_reset: Optional[int], -) -> Iterator[requests_mock.Mocker]: - """Mock requests to the GitHub API, returning a rate-limiting status code - after `num_before_ratelimit` requests. - - GitHub does inconsistent rate-limiting: - - Anonymous requests return a 403 status code - - Authenticated requests return a 429 status code, with an - X-Ratelimit-Reset header. - - This fixture takes multiple arguments (which can be overridden with a - :func:`pytest.mark.parametrize` parameter): - - num_before_ratelimit: the global number of requests until the - ratelimit triggers - - num_ratelimit: the number of requests that return a - rate-limited response. - - ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the - request is authenticated. - - The default values set in the previous fixtures make all requests return a rate - limit response. - """ - current_request = 0 - - def response_callback(request, context): - nonlocal current_request - current_request += 1 - if num_before_ratelimit < current_request and ( - num_ratelimit is None - or current_request < num_before_ratelimit + num_ratelimit + 1 - ): - return github_ratelimit_callback(request, context, ratelimit_reset) - else: - return github_response_callback(request, context) - - with requests_mock.Mocker() as mock: - mock.get(GitHubLister.API_URL, json=response_callback) - yield mock - - def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None: - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler) assert lister.github_session.anonymous assert "using anonymous mode" in caplog.records[-1].message caplog.clear() res = lister.run() assert res == ListerStats(pages=0, origins=0) last_log = caplog.records[-1] assert last_log.levelname == "WARNING" assert "No X-Ratelimit-Reset value found in responses" in last_log.message -@pytest.fixture -def 
github_credentials() -> List[Dict[str, str]]: - """Return a static list of GitHub credentials""" - return sorted( - [{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)] - + [ - {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"} - for i in range(3) - ], - key=lambda c: c["username"], - ) - - -@pytest.fixture -def all_tokens(github_credentials) -> List[str]: - """Return the list of tokens matching the static credential""" - - return [t.get("token", t.get("password")) for t in github_credentials] - - @pytest.fixture def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType: """Return the credentials formatted for use by the lister""" return {"github": {"github": github_credentials}} def test_authenticated_credentials( swh_scheduler, caplog, github_credentials, lister_credentials, all_tokens ): """Test credentials management when the lister is authenticated""" caplog.set_level(logging.DEBUG, "swh.lister.github.lister") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) assert lister.github_session.token_index == 0 assert sorted(lister.credentials, key=lambda t: t["username"]) == github_credentials assert lister.github_session.session.headers["Authorization"] in [ "token %s" % t for t in all_tokens ] -def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None): - """Record calls to time.sleep in the sleep_calls list""" - if duration < 0: - raise ValueError("Can't sleep for a negative amount of time!") - if sleep_calls is not None: - sleep_calls.append(duration) - - -def fake_time_time(): - """Return 0 when running time.time()""" - return 0 - - -@pytest.fixture -def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]: - """Monkeypatch `time.time` and `time.sleep`. 
Returns a list cumulating the arguments - passed to time.sleep().""" - sleeps: List[float] = [] - monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps)) - monkeypatch.setattr(time, "time", fake_time_time) - yield sleeps - - @pytest.mark.parametrize( "num_ratelimit", [1] ) # return a single rate-limit response, then continue def test_ratelimit_once_recovery( swh_scheduler, caplog, requests_ratelimited, num_ratelimit, monkeypatch_sleep_calls, lister_credentials, ): """Check that the lister recovers from hitting the rate-limit once""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) res = lister.run() # check that we used all the pages assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) token_users = [] for record in caplog.records: if "Using authentication token" in record.message: token_users.append(record.args[0]) # check that we used one more token than we saw rate limited requests assert len(token_users) == 1 + num_ratelimit # check that we slept for one second between our token uses assert monkeypatch_sleep_calls == [1] @pytest.mark.parametrize( # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a # set value for X-Ratelimit-Reset, then resume listing successfully. 
"num_before_ratelimit, num_ratelimit, ratelimit_reset", [(5, 6, 123456)], ) def test_ratelimit_reset_sleep( swh_scheduler, caplog, requests_ratelimited, monkeypatch_sleep_calls, num_before_ratelimit, ratelimit_reset, github_credentials, lister_credentials, ): """Check that the lister properly handles rate-limiting when providing it with authentication tokens""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) res = lister.run() assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) # We sleep 1 second every time we change credentials, then we sleep until # ratelimit_reset + 1 expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1] assert monkeypatch_sleep_calls == expected_sleep_calls found_exhaustion_message = False for record in caplog.records: if record.levelname == "INFO": if "Rate limits exhausted for all tokens" in record.message: found_exhaustion_message = True break assert found_exhaustion_message diff --git a/swh/lister/github/utils.py b/swh/lister/github/utils.py index 269d432..df6088e 100644 --- a/swh/lister/github/utils.py +++ b/swh/lister/github/utils.py @@ -1,170 +1,6 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging -import random -import time -from typing import Dict, List, Optional - -import requests -from tenacity import ( - retry, - retry_any, - retry_if_exception_type, - retry_if_result, - wait_exponential, -) - -logger = logging.getLogger(__name__) - - -class RateLimited(Exception): - def __init__(self, response): - self.reset_time: Optional[int] - - # Figure out how long we need to sleep because of that rate limit - ratelimit_reset = 
response.headers.get("X-Ratelimit-Reset") - retry_after = response.headers.get("Retry-After") - if ratelimit_reset is not None: - self.reset_time = int(ratelimit_reset) - elif retry_after is not None: - self.reset_time = int(time.time()) + int(retry_after) + 1 - else: - logger.warning( - "Received a rate-limit-like status code %s, but no rate-limit " - "headers set. Response content: %s", - response.status_code, - response.content, - ) - self.reset_time = None - self.response = response - - -class MissingRateLimitReset(Exception): - pass - - -class GitHubSession: - """Manages a :class:`requests.Session` with (optionally) multiple credentials, - and cycles through them when reaching rate-limits.""" - - def __init__( - self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None - ) -> None: - """Initialize a requests session with the proper headers for requests to - GitHub.""" - self.credentials = credentials - if self.credentials: - random.shuffle(self.credentials) - - self.session = requests.Session() - - self.session.headers.update( - {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent} - ) - - self.anonymous = not self.credentials - - if self.anonymous: - logger.warning("No tokens set in configuration, using anonymous mode") - - self.token_index = -1 - self.current_user: Optional[str] = None - - if not self.anonymous: - # Initialize the first token value in the session headers - self.set_next_session_token() - - def set_next_session_token(self) -> None: - """Update the current authentication token with the next one in line.""" - - assert self.credentials - - self.token_index = (self.token_index + 1) % len(self.credentials) - - auth = self.credentials[self.token_index] - - self.current_user = auth["username"] - logger.debug("Using authentication token for user %s", self.current_user) - - if "password" in auth: - token = auth["password"] - else: - token = auth["token"] - - self.session.headers.update({"Authorization": f"token 
{token}"}) - - @retry( - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_any( - # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g. - # when running the lister on a connection with high latency - retry_if_exception_type(requests.exceptions.ChunkedEncodingError), - # 502 status codes happen for a Server Error, sometimes - retry_if_result(lambda r: r.status_code == 502), - ), - ) - def _request(self, url: str) -> requests.Response: - response = self.session.get(url) - - if ( - # GitHub returns inconsistent status codes between unauthenticated - # rate limit and authenticated rate limits. Handle both. - response.status_code == 429 - or (self.anonymous and response.status_code == 403) - ): - raise RateLimited(response) - - return response - - def request(self, url) -> requests.Response: - """Repeatedly requests the given URL, cycling through credentials and sleeping - if necessary; until either a successful response or :exc:`MissingRateLimitReset` - """ - # The following for/else loop handles rate limiting; if successful, - # it provides the rest of the function with a `response` object. - # - # If all tokens are rate-limited, we sleep until the reset time, - # then `continue` into another iteration of the outer while loop, - # attempting to get data from the same URL again. 
- - while True: - max_attempts = len(self.credentials) if self.credentials else 1 - reset_times: Dict[int, int] = {} # token index -> time - for attempt in range(max_attempts): - try: - return self._request(url) - except RateLimited as e: - reset_info = "(unknown reset)" - if e.reset_time is not None: - reset_times[self.token_index] = e.reset_time - reset_info = "(resetting in %ss)" % (e.reset_time - time.time()) - - if not self.anonymous: - logger.info( - "Rate limit exhausted for current user %s %s", - self.current_user, - reset_info, - ) - # Use next token in line - self.set_next_session_token() - # Wait one second to avoid triggering GitHub's abuse rate limits - time.sleep(1) - - # All tokens have been rate-limited. What do we do? - - if not reset_times: - logger.warning( - "No X-Ratelimit-Reset value found in responses for any token; " - "Giving up." - ) - raise MissingRateLimitReset() - - sleep_time = max(reset_times.values()) - time.time() + 1 - logger.info( - "Rate limits exhausted for all tokens. 
Sleeping for %f seconds.", - sleep_time, - ) - time.sleep(sleep_time) +from swh.core.github.utils import * # noqa diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 4ccc532..2dc6cc5 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,390 +1,425 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log import xmltodict +from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] +SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") + @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. 
This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) self.jar_origins: Dict[str, ListedOrigin] = {} + self.github_session = GitHubSession( + credentials=self.credentials, user_agent=USER_AGENT + ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def 
get_pages(self) -> Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) if response.status_code != 200: logger.error("Index %s not found, stopping", self.INDEX_URL) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P[^|]+)\|(?P[^|]+)\|(?P[^|]+)\|" + r"(?P[^|]+)\|(?P[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). 
jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. 
logger.info("Fetching poms..") for pom in out_pom: try: response = self.page_request(pom, {}) project = xmltodict.parse(response.content) project_d = project.get("project", {}) scm_d = project_d.get("scm") if scm_d is not None: connection = scm_d.get("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], "url": connection, } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom, ) except xmltodict.expat.ExpatError as error: logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: + """Retrieve scm origin out of the page information. Only called when type of the + page is scm. + + Try and detect an scm/vcs repository. Note that official format is in the form: + scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put + the repo url (without the "scm:type"), so we have to check against the content + to extract the type and url properly. + + Raises + AssertionError when the type of the page is not 'scm' + + Returns + ListedOrigin with proper canonical scm url (for github) if any is found, + None otherwise. 
+ + """ + + assert page["type"] == "scm" + visit_type: Optional[str] = None + url: Optional[str] = None + m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) + if m_scm is None: + return None + + scm_type = m_scm.group("type") + if scm_type and scm_type in SUPPORTED_SCM_TYPES: + url = m_scm.group("url") + visit_type = scm_type + elif page["url"].endswith(".git"): + url = page["url"].lstrip("scm:") + visit_type = "git" + else: + return None + + if url and visit_type == "git": + # Non-github urls will be returned as is, github ones will be canonical ones + url = self.github_session.get_canonical_url(url) + + if not url: + return None + + assert visit_type is not None + assert self.lister_obj.id is not None + return ListedOrigin( + lister_id=self.lister_obj.id, + url=url, + visit_type=visit_type, + ) + def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: + """Convert a page of Maven repositories into a list of ListedOrigins.""" - assert self.lister_obj.id is not None - scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": - # If origin is a scm url: detect scm type and yield. - # Note that the official format is: - # scm:git:git://github.com/openengsb/openengsb-framework.git - # but many, many projects directly put the repo url, so we have to - # detect the content to match it properly. 
- m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) - if m_scm is not None: - scm_type = m_scm.group("type") - if scm_type in scm_types_ok: - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=scm_url, - visit_type=scm_type, - ) - yield origin - else: - if page["url"].endswith(".git"): - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=page["url"], - visit_type="git", - ) - yield origin + listed_origin = self.get_scm(page) + if listed_origin: + yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet + assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): 
jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 331461e..b2a88f9 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,334 +1,353 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" -LIST_GIT = ( - "git://github.com/aldialimucaj/sprova4j.git", - "https://github.com/aldialimucaj/sprova4j.git", -) -LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) +USER_REPO0 = "aldialimucaj/sprova4j" +GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}" +GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}" +LIST_GIT = (GIT_REPO_URL0_HTTPS,) + +USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java" +GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}" +GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git" +GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" +LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,) 
LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": MVN_URL, }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", "base_url": MVN_URL, }, ) @pytest.fixture def maven_index_full(datadir) -> bytes: return Path(datadir, "http_indexes", "export_full.fld").read_bytes() @pytest.fixture def maven_index_incr_first(datadir) -> bytes: return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes() @pytest.fixture def maven_pom_1(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() @pytest.fixture def maven_index_null_mtime(datadir) -> bytes: return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes() @pytest.fixture def maven_pom_1_malformed(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes() @pytest.fixture def maven_pom_2(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() @pytest.fixture def maven_pom_3(datadir) -> bytes: return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() +@pytest.fixture +def requests_mock(requests_mock): + """If github api calls for the configured scm repository, returns its canonical url.""" + for url_api, url_html in [ + (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), + (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), + ]: + requests_mock.get( + url_api, + json={"html_url": url_html}, + ) + yield requests_mock + + @pytest.fixture(autouse=True) def network_requests_mock( requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): requests_mock.get(INDEX_URL, 
content=maven_index_full) requests_mock.get(URL_POM_1, content=maven_pom_1) requests_mock.get(URL_POM_2, content=maven_pom_2) requests_mock.get(URL_POM_3, content=maven_pom_3) def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 3 git origins + 1 maven origin with 2 releases (one per jar) - assert len(origin_urls) == 4 + assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_full_listing_malformed( swh_scheduler, requests_mock, maven_pom_1_malformed, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. requests_mock.get(URL_POM_1, content=maven_pom_1_malformed) # Then run the lister. stats = lister.run() # Start test checks. 
assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 - assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC) + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. 
assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert len(origin_urls) == 2 - assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC) + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. 
assert lister.incremental assert lister.updated assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( swh_scheduler, requests_mock, http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. requests_mock.get(URL_POM_1, status_code=http_code) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() # If the maven_index_full step succeeded but not the get_pom step, # then we get only one maven-jar origin and one git origin. 
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 3 + assert len(scheduler_origins) == 2 def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime): requests_mock.get(INDEX_URL, content=maven_index_null_mtime) # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].last_update is None def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1): """should continue listing when failing to decode pom file.""" # Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32")) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() # If the maven_index_full step succeeded but not the pom parsing step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 3 + assert len(scheduler_origins) == 2