diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4507253..edfc845 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,45 +1,42 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 hooks: - id: trailing-whitespace - id: check-json - id: check-yaml - repo: https://gitlab.com/pycqa/flake8 rev: 4.0.1 hooks: - id: flake8 additional_dependencies: [flake8-bugbear==22.3.23] - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: - id: codespell name: Check source code spelling exclude: ^(swh/lister/.*/tests/data/.*)$ args: [-L crate] stages: [commit] - - id: codespell - name: Check commit message spelling - stages: [commit-msg] - repo: local hooks: - id: mypy name: mypy entry: mypy args: [swh] pass_filenames: false language: system types: [python] - repo: https://github.com/PyCQA/isort rev: 5.10.1 hooks: - id: isort - repo: https://github.com/python/black rev: 22.3.0 hooks: - id: black diff --git a/PKG-INFO b/PKG-INFO index c8f6500..178896e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,127 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.8.1 +Version: 2.8.2 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: 
testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
diff --git a/debian/changelog b/debian/changelog index 1c9c48a..48a347e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,1142 +1,1147 @@ -swh-lister (2.8.1-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-lister (2.8.2-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 2.8.2 - (tagged by Antoine R. Dumont + (@ardumont) on 2022-04-25 12:34:14 + +0200) + * Upstream changes: - v2.8.2 - sourceforge: Fix listing of bzr + projects - sourceforge: Do not consider Attic as a valid CVS + module - -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Apr 2022 14:05:42 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Apr 2022 10:39:18 +0000 swh-lister (2.8.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.8.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-14 15:56:17 +0200) * Upstream changes: - v2.8.1 - maven: Fix argument of type 'NoneType' is not iterable -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Apr 2022 14:01:42 +0000 swh-lister (2.8.0-1~swh2) unstable-swh; urgency=medium * Bump new release (fix build dep) -- Antoine R. Dumont (@ardumont) Thu, 14 Apr 2022 14:51:05 +0200 swh-lister (2.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.8.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-04-14 11:42:16 +0200) * Upstream changes: - v2.8.0 - lister: Add new rust crates lister - maven: Continue listing if unable to retrieve pom information - maven: log error message when not able to retrieve the index to read -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Apr 2022 09:50:25 +0000 swh-lister (2.7.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.7.2 - (tagged by Antoine Lambert on 2022-03-11 13:34:15 +0100) * Upstream changes: - version 2.7.2 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Mar 2022 12:38:38 +0000 swh-lister (2.7.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.7.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-18 10:42:52 +0100) * Upstream changes: - v2.7.1 - launchpad: Ignore erratic page and continue listing next page -- Software Heritage autobuilder (on jenkins-debian1) Fri, 18 Feb 2022 09:46:37 +0000 swh-lister (2.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-17 13:56:23 +0100) * Upstream changes: - v2.7.0 - launchpad: Allow bzr origins listing - launchpad: Manage unhandled exceptions when listing - sourceforge: Fix origin URLs for CVS projects -- Software Heritage autobuilder (on jenkins-debian1) Thu, 17 Feb 2022 13:02:22 +0000 swh-lister (2.6.4-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.4 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-14 16:57:38 +0100) * Upstream changes: - v2.6.4 - sourceforge: fix support for listing bzr origins -- Software Heritage autobuilder (on jenkins-debian1) Mon, 14 Feb 2022 16:01:23 +0000 swh-lister (2.6.3-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.3 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-02-09 17:20:28 +0100) * Upstream changes: - v2.6.3 - maven: Fix last update datetime -- Software Heritage autobuilder (on jenkins-debian1) Wed, 09 Feb 2022 16:24:11 +0000 swh-lister (2.6.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-08 10:39:05 +0100) * Upstream changes: - v2.6.2 - Remove no longer needed tenacity workarounds - maven: Fix undef last_update in ListedOrigins. - maven: dismiss origins if they are malformed - e.g. wrong pom scm format, add test. - maven: Let logging instruction do the formatting - maven: Add more debug logging instruction - maven: Pass the base URL of the Maven instance to the loader - docs: Fix ReST syntax and sphinx warnings - Pin mypy and drop type annotations which makes mypy unhappy - requirements-test: Pin pytest to < 7.0.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Feb 2022 09:43:37 +0000 swh-lister (2.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.1 - (tagged by Antoine Lambert on 2021-12-06 10:47:19 +0100) * Upstream changes: - version 2.6.1 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Dec 2021 09:51:07 +0000 swh-lister (2.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.0 - (tagged by Antoine Lambert on 2021-12-03 16:17:52 +0100) * Upstream changes: - version 2.6.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 15:22:00 +0000 swh-lister (2.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.0 - (tagged by Antoine Lambert on 2021-12-03 14:44:36 +0100) * Upstream changes: - version 2.5.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 13:48:49 +0000 swh-lister (2.4.0-1~swh3) unstable-swh; urgency=medium * Fix changelog error and actual correct release -- Antoine R. 
Dumont (@ardumont) Fri, 03 Dec 2021 12:45:00 +0100 swh.lister (2.4.0-1~swh2) unstable-swh; urgency=medium * Update missing deps and release -- Antoine R. Dumont (@ardumont) Fri, 03 Dec 2021 12:37:13 +0100 swh-lister (2.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-03 12:17:36 +0100) * Upstream changes: - v2.4.0 - debian: Update extra_loader_arguments dict produced ListedOrigin models - debian: Add missing file URIs in lister output - Deduplicate origins in the GitHub lister - lister: Add new maven lister -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 11:21:58 +0000 swh-lister (2.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.3.0 - (tagged by Valentin Lorentz on 2021-11-10 13:44:49 +0100) * Upstream changes: - v2.3.0 - * cran: Pass the package name to the loader -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Nov 2021 13:03:02 +0000 swh-lister (2.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.0 - (tagged by Antoine Lambert on 2021-10-22 15:16:48 +0200) * Upstream changes: - version 2.2.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Oct 2021 13:23:02 +0000 swh-lister (2.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.1.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-10-13 10:16:37 +0200) * Upstream changes: - v2.1.0 - Let sourceforge origins be listed "enabled" by default - docs: Add a save forge documentation - docs: Explain task type registering to complete the save forge doc -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Oct 2021 08:21:42 +0000 swh-lister (2.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-09-29 09:21:37 +0200) * Upstream changes: - v2.0.0 - opam: Share opam root directory even on multiple instances -- Software Heritage autobuilder (on jenkins-debian1) Wed, 29 Sep 2021 07:31:03 +0000 swh-lister (1.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-21 11:23:23 +0200) * Upstream changes: - v1.9.0 - gnu: Respect the pattern docstring about state initialization - opam: Allow defining where to actually install the opam_root folder - opam: Make the instance optional and derived from the url - opam: Move the state initialization into the get_pages method -- Software Heritage autobuilder (on jenkins-debian1) Tue, 21 Sep 2021 09:29:04 +0000 swh-lister (1.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-17 15:44:00 +0200) * Upstream changes: - v1.8.0 - Allow gitlab lister's name to be overridden by task arguments -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 13:47:58 +0000 swh-lister (1.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-17 13:37:22 +0200) * Upstream changes: - v1.7.0 - gitlab: Allow ingestion of hg_git origins as hg ones (some instance can list tose e.g - foss.heptapod.net) -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 11:41:52 +0000 swh-lister (1.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-17 10:50:28 +0200) * Upstream changes: - v1.6.0 - gitlab: Allow listing of instances providing multiple vcs_type -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Sep 2021 08:55:14 +0000 swh-lister (1.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.5.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-07-23 16:28:50 +0200) * Upstream changes: - v1.5.0 - gitlab: Handle HTTP status code 500 when listing projects - gitlab: Update requests query parameters - gitlab: Adapt requests retry policy to consider HTTP 50x status codes - opam: Directly use the --root flag instead of using an env variable - pattern: Use URL network location as instance name when not provided -- Software Heritage autobuilder (on jenkins-debian1) Fri, 23 Jul 2021 14:32:51 +0000 swh-lister (1.4.0-1~swh2) unstable-swh; urgency=medium * Bump new release -- Antoine R. Dumont (@ardumont) Fri, 09 Jul 2021 13:17:00 +0200 swh-lister (1.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-07-09 13:01:04 +0200) * Upstream changes: - v1.4.0 - New Tuleap lister - New Opam lister - Make PyPI lister incremental - Make PyPI lister complete the information on origins -- Software Heritage autobuilder (on jenkins-debian1) Fri, 09 Jul 2021 11:06:37 +0000 swh-lister (1.3.6-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.6 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-04 11:59:24 +0200) * Upstream changes: - v1.3.6 - sourceforge: use http:// for Mercurial (as workaround) -- Software Heritage autobuilder (on jenkins-debian1) Fri, 04 Jun 2021 10:03:14 +0000 swh-lister (1.3.5-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.5 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-03 10:22:17 +0200) * Upstream changes: - v1.3.5 - sourceforge: set the protocol for origin urls -- Software Heritage autobuilder (on jenkins-debian1) Thu, 03 Jun 2021 08:26:13 +0000 swh-lister (1.3.4-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-05-31 16:54:37 +0200) * Upstream changes: - v1.3.4 - Disable the sourceforge lister origins (so they can be listed) -- Software Heritage autobuilder (on jenkins-debian1) Mon, 31 May 2021 15:08:17 +0000 swh-lister (1.3.3-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.3 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-28 14:18:53 +0200) * Upstream changes: - v1.3.3 - cgit/lister: Fix error when a missing version is not provided -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 May 2021 12:39:52 +0000 swh-lister (1.3.2-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.2 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-26 12:43:45 +0200) * Upstream changes: - v1.3.2 - sourceforge: retry for all retryable exceptions -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 May 2021 10:48:22 +0000 swh-lister (1.3.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-19 11:25:59 +0200) * Upstream changes: - v1.3.1 - sourceforge: don't abort on error for project -- Software Heritage autobuilder (on jenkins-debian1) Wed, 19 May 2021 09:30:14 +0000 swh-lister (1.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-05-07 17:17:50 +0200) * Upstream changes: - v1.3.0 - sourceforge/tasks: Allow incremental listing - sourceforge/lister: Add credentials parameter -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 15:24:27 +0000 swh-lister (1.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.2 - (tagged by Antoine Lambert on 2021-05-07 14:43:24 +0200) * Upstream changes: - version 1.2.2 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 12:50:12 +0000 swh-lister (1.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.1 - (tagged by Antoine Lambert on 2021-05-07 14:10:36 +0200) * Upstream changes: - version 1.2.1 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 12:17:16 +0000 swh-lister (1.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-06 15:17:51 +0200) * Upstream changes: - v1.2.0 - Make the SourceForge lister incremental -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 May 2021 10:43:11 +0000 swh-lister (1.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.1.0 - (tagged by Antoine Lambert on 2021-04-29 14:29:27 +0200) * Upstream changes: - version 1.1.0 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 29 Apr 2021 12:33:59 +0000 swh-lister (1.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.0 - (tagged by Nicolas Dandrimont on 2021-03-22 10:56:04 +0100) * Upstream changes: - Release swh.lister v1.0.0 - All listers have been rewritten and are ready to be used in production - with the most recent version of the swh.scheduler APIs. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Mar 2021 10:13:35 +0000 swh-lister (0.10.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.10.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-03-01 09:59:16 +0100) * Upstream changes: - v0.10.0 - docs: Add new "howto write a lister tutorial" with unified lister api -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Mar 2021 09:01:54 +0000 swh-lister (0.9.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-08 14:09:27 +0100) * Upstream changes: - v0.9.1 - debian: Update archive mirror URL templates to process -- Software Heritage autobuilder (on jenkins-debian1) Mon, 08 Feb 2021 13:12:05 +0000 swh-lister (0.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-08 08:50:07 +0100) * Upstream changes: - v0.9.0 - docs: Update listers execution instructions - cran: Prevent multiple listing of an origin - cran: Add support for parsing date with milliseconds - pypi: Use BeautifulSoup for parsing HTML instead of xmltodict -- Software Heritage autobuilder (on jenkins-debian1) Mon, 08 Feb 2021 07:52:57 +0000 swh-lister (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-03 11:12:52 +0100) * Upstream changes: - v0.8.0 - packagist: Reimplement lister using new Lister API - gnu: Remove dependency on pytz - Remove no longer used models field in dict returned by register - Remove no longer used legacy Lister API and update CLI options -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Feb 2021 10:15:54 +0000 swh-lister (0.7.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.1 - (tagged by Vincent SELLIER on 2021-02-01 17:52:33 +0100) * Upstream changes: - v0.7.1 - * cgit: remove the repository urls's trailing / -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Feb 2021 16:56:35 +0000 swh-lister (0.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-02-01 09:31:30 +0100) * Upstream changes: - v0.7.0 - pattern: Bump packet split to chunk of 1000 records - cgit: Compute origin urls out of a base git url when provided. - gnu: Reimplement lister using new Lister API -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Feb 2021 08:35:14 +0000 swh-lister (0.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-29 09:07:21 +0100) * Upstream changes: - v0.6.1 - launchpad: Remove call to dataclasses.asdict on lister state - launchpad: Prevent error due to origin listed twice - Make debian lister constructors compatible with credentials - launchpad/tasks: Fix ping task function name - pattern: Make lister flush regularly origins to scheduler -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Jan 2021 08:11:13 +0000 swh-lister (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-28 15:48:32 +0100) * Upstream changes: - v0.6.0 - launchpad: Reimplement lister using new Lister API - Make stateless lister constructors compatible with credentials -- Software Heritage autobuilder (on jenkins-debian1) Thu, 28 Jan 2021 14:52:49 +0000 swh-lister (0.5.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.4 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-28 11:23:29 +0100) * Upstream changes: - v0.5.4 - gitlab: Deal with missing or trailing / in url input - tox.ini: Work around build failure due to upstream release -- Software Heritage autobuilder (on jenkins-debian1) Thu, 28 Jan 2021 10:27:59 +0000 swh-lister (0.5.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.2 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-01-27 17:19:10 +0100) * Upstream changes: - v0.5.2 - test_cli: Drop launchpad lister from the test_get_lister -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Jan 2021 16:25:31 +0000 swh-lister (0.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-27 16:39:20 +0100) * Upstream changes: - v0.5.1 - launchpad: Actually mock the anonymous login to launchpad - Drop no longer swh.lister.core.{indexing,page_by_page}_lister - tests: Drop unneeded reset instruction - cgit: Don't stop the listing when a repository page is not available -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Jan 2021 15:47:39 +0000 swh-lister (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-27 14:33:24 +0100) * Upstream changes: - v0.5.0 - cgit: Add support for last_update information during listing - Port Debian lister to new lister api - gitlab: Implement keyset-based pagination listing - cran: Retrieve last update date for each listed package - Port CRAN lister to new lister api - gitlab: Add support for last_update information during listing - Port Gitea lister to new lister api - Port cgit lister to the new lister api - bitbucket: Pick random credentials in configuration and improve logging - Port Gitlab lister to the new lister api - Port Npm lister to new lister api - Port PyPI lister to new lister api - Port Bitbucket lister to new lister api - Port Phabricator lister to new lister api - Port GitHub lister to new lister api - Introduce a simpler base pattern for lister implementations -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Jan 2021 13:40:34 +0000 swh-lister (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-11-23 15:47:05 +0100) * Upstream changes: - v0.4.0 - requirements: Rework dependencies - tests: Reduce db initialization fixtures to a minimum - Create listing task with a default of 3 if unspecified - lister.pytest_plugin: Simplify fixture setup - tests: Clarify listers test configuration -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Nov 2020 14:52:03 +0000 swh-lister (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-19 09:50:43 +0200) * Upstream changes: - v0.3.0 - lister.config: Adapt scheduler configuration structure - drop mock_get_scheduler which creates indirection for no good reason -- Software Heritage autobuilder (on jenkins-debian1) Mon, 19 Oct 2020 07:56:17 +0000 swh-lister (0.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-07 14:02:42 +0200) * Upstream changes: - v0.2.1 - lister_base: Drop leftover mixin SWHConfig which is no longer used -- Software Heritage autobuilder (on jenkins-debian1) Wed, 07 Oct 2020 12:07:43 +0000 swh-lister (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-06 09:33:33 +0200) * Upstream changes: - v0.2.0 - lister*: Migrate away from SWHConfig mixin - tox.ini: pin black to the pre-commit version (19.10b0) to avoid flip-flops - Run isort after the CLI import changes -- Software Heritage autobuilder (on jenkins-debian1) Tue, 06 Oct 2020 07:36:07 +0000 swh-lister (0.1.5-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.5 - (tagged by David Douard on 2020-09-25 11:51:57 +0200) * Upstream changes: - v0.1.5 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 09:55:44 +0000 swh-lister (0.1.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-09-10 11:32:46 +0200) * Upstream changes: - v0.1.4 - gitea.lister: Fix uid to be unique across instance - utils.split_range: Split into not overlapping ranges - gitea.tasks: Fix parameter name from 'sort' to 'order' -- Software Heritage autobuilder (on jenkins-debian1) Thu, 10 Sep 2020 09:35:53 +0000 swh-lister (0.1.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.3 - (tagged by Vincent SELLIER on 2020-09-08 14:48:08 +0200) * Upstream changes: - v0.1.3 - Launchpad: rename task name to match conventions - tests: Separate lister instantiations -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Sep 2020 12:53:22 +0000 swh-lister (0.1.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-02 13:07:30 +0200) * Upstream changes: - v0.1.2 - pytest_plugin: Instantiate only lister with no particular setup - pytest: Define plugin and declare it in the root conftest -- Software Heritage autobuilder (on jenkins-debian1) Wed, 02 Sep 2020 11:10:14 +0000 swh-lister (0.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-01 16:08:48 +0200) * Upstream changes: - v0.1.1 - test_cli: Exclude launchpad lister from the check -- Software Heritage autobuilder (on jenkins-debian1) Tue, 01 Sep 2020 14:11:46 +0000 swh-lister (0.1.0-1~swh2) unstable-swh; urgency=medium * Update dependencies -- Antoine R. 
Dumont (@ardumont) Wed, 26 Aug 2020 16:05:03 +0000 swh-lister (0.1.0-1~swh1) unstable-swh; urgency=medium [ Nicolas Dandrimont ] * Use setuptools-scm instead of vcversioner [ Software Heritage autobuilder (on jenkins-debian1) ] * New upstream release 0.1.0 - (tagged by David Douard on 2020-08-25 18:33:55 +0200) * Upstream changes: - v0.1.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 25 Aug 2020 16:39:28 +0000 swh-lister (0.0.50-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.50 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-20 10:44:57 +0100) * Upstream changes: - v0.0.50 - github.lister: Filter out partial repositories which break listing - docs: Fix sphinx warnings - core.lister_base: Improve slightly docs and types -- Software Heritage autobuilder (on jenkins-debian1) Mon, 20 Jan 2020 09:51:23 +0000 swh-lister (0.0.49-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.49 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-17 14:20:35 +0100) * Upstream changes: - v0.0.49 - github.lister: Use Retry-After header when rate limit reached -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Jan 2020 13:27:56 +0000 swh-lister (0.0.48-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.48 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-16 13:56:12 +0100) * Upstream changes: - v0.0.48 - cran.lister: Use cran's canonical url for origin url - cran.lister: Version uid so we can list new package versions - cran.lister: Adapt docstring sample accordingly -- Software Heritage autobuilder (on jenkins-debian1) Thu, 16 Jan 2020 13:03:54 +0000 swh-lister (0.0.47-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.47 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-01-09 10:26:18 +0100) * Upstream changes: - v0.0.47 - cran.lister: Align loading tasks' with loader's expectation -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Jan 2020 09:34:26 +0000 swh-lister (0.0.46-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.46 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-19 14:09:45 +0100) * Upstream changes: - v0.0.46 - lister.debian: Make debian init step idempotent and up-to-date - lister_base: Split into chunks the tasks prior to creation -- Software Heritage autobuilder (on jenkins-debian1) Thu, 19 Dec 2019 13:16:45 +0000 swh-lister (0.0.45-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.45 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-10 11:27:17 +0100) * Upstream changes: - v0.0.45 - core: Align listers' task output (hg/git tasks) with expected format - npm: Align lister's loader output tasks with expected format - lister/tasks: Standardize return statements -- Software Heritage autobuilder (on jenkins-debian1) Tue, 10 Dec 2019 10:32:45 +0000 swh-lister (0.0.44-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.44 - (tagged by Nicolas Dandrimont on 2019-11-22 16:15:54 +0100) * Upstream changes: - Release swh.lister v0.0.44 - Define proper User Agents everywhere -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Nov 2019 15:31:33 +0000 swh-lister (0.0.43-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.43 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-21 18:46:35 +0100) * Upstream changes: - v0.0.43 - lister.pypi: Align lister with pypi package loader - lister.npm: Align lister with npm package loader - lister.tests: Avoid duplication setup step - Fix typos (and trailing ws) reported by codespell - Add a pre-commit config file -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 17:56:34 +0000 swh-lister (0.0.42-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.42 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 13:52:16 +0100) * Upstream changes: - v0.0.42 - cran/gnu: Rename task_type to load-archive-files - lister.tests: Add missing task_type for package listers - Migrate tox.ini to extras = xxx instead of deps = .[testing] - Merge tox environments - Include all requirements in MANIFEST.in - lister.cli: Remove task type register cli -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 13:00:29 +0000 swh-lister (0.0.41-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.41 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-15 12:02:13 +0100) * Upstream changes: - v0.0.41 - simple_lister: Flush to db more frequently - gnu.lister: Use url as primary key - gnu.lister.tests: Add missing assertion - gnu.lister: Add missing retries_left parameter - debian.models: Migrate tests from storage to debian lister model -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Nov 2019 11:06:35 +0000 swh-lister (0.0.40-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.40 - (tagged by Nicolas Dandrimont on 2019-11-13 13:54:38 +0100) * Upstream changes: - Release swh.lister 0.0.40 - Fix bogus NotImplementedError on Area.index_uris -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Nov 2019 13:02:08 +0000 swh-lister (0.0.39-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.39 - (tagged by Nicolas Dandrimont on 2019-11-13 13:23:31 +0100) * Upstream changes: - Release swh.lister 0.0.39 - Properly register all tasks - Fix up db_partition_indices to avoid expensive scans -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Nov 2019 12:28:33 +0000 swh-lister (0.0.38-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.38 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-06 15:55:46 +0100) * Upstream changes: - v0.0.38 - Remove swh.storage.schemata remnants -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Nov 2019 15:00:16 +0000 swh-lister (0.0.37-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.37 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-06 15:06:51 +0100) * Upstream changes: - v0.0.37 - Update swh-core dependency -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Nov 2019 14:18:31 +0000 swh-lister (0.0.36-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.36 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-06 11:33:33 +0100) * Upstream changes: - v0.0.36 - lister.*.tests: Add at least one integration test - gnu.lister: Move gnu listers specifity within the lister's scope - debian/lister: Use url parameter name instead of origin - debian/model: Install lister model within the lister repository - lister.*.tasks: Stop binding tasks to a specific instance of the - celery app - cran.lister: Refactor and fix cran lister - github/lister: Prevent erroneous scheduler tasks disabling - phabricator/lister: Fix lister - setup.py: Kill deprecated swh- lister command - Bootstrap typing annotations -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Nov 2019 10:55:41 +0000 swh-lister (0.0.35-1~swh4) unstable-swh; urgency=medium * Fix runtime dependencies -- Antoine R. Dumont (@ardumont) Wed, 11 Sep 2019 10:58:01 +0200 swh-lister (0.0.35-1~swh3) unstable-swh; urgency=medium * Bump dh-python to >= 3 for pybuild.testfiles. -- Nicolas Dandrimont Tue, 10 Sep 2019 14:58:11 +0200 swh-lister (0.0.35-1~swh2) unstable-swh; urgency=medium * Add egg-info to pybuild.testfiles. Close T1995. -- Nicolas Dandrimont Tue, 10 Sep 2019 14:36:22 +0200 swh-lister (0.0.35-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.35 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-09 12:14:42 +0200) * Upstream changes: - v0.0.35 - Fix debian package -- Software Heritage autobuilder (on jenkins-debian1) Mon, 09 Sep 2019 10:19:02 +0000 swh-lister (0.0.34-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.34 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-09-06 14:03:39 +0200) * Upstream changes: - v0.0.34 - listers: Implement listers as plugins - cgit: rewrite the CGit lister (and add more tests) - listers: simplify and unify constructor use - phabricator: randomly select the API token in the provided list - docs: Fix toc -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Sep 2019 12:09:13 +0000 swh-lister (0.0.33-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.33 - (tagged by Antoine R. Dumont (@ardumont) on 2019-08-29 10:23:20 +0200) * Upstream changes: - v0.0.33 - lister.cli: Allow to list forges with policy and priority - listers: Add New packagist lister - listers: Allow to override policy and priority for scheduled tasks - tests: Add tests to cli, pypi and improve lister core's - docs: Add code of conduct document -- Software Heritage autobuilder (on jenkins-debian1) Thu, 29 Aug 2019 08:28:23 +0000 swh-lister (0.0.32-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.32 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-28 18:21:50 +0200) * Upstream changes: - v0.0.32 - Clean up dead code - Add missing *.html sample for tests to run in packaging -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 Jun 2019 16:42:05 +0000 swh-lister (0.0.31-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.31 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-28 17:57:48 +0200) * Upstream changes: - v0.0.31 - Add cgit instance lister - Add back description in cran lister - Update contributors -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 Jun 2019 16:06:25 +0000 swh-lister (0.0.30-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.30 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-26 14:52:13 +0200) * Upstream changes: - v0.0.30 - Drop last description mentions for gitlab and cran listers. 
-- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 13:02:11 +0000 swh-lister (0.0.29-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.29 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-26 12:37:14 +0200) * Upstream changes: - v0.0.29 - lister: Fix bitbucket lister -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 10:47:20 +0000 swh-lister (0.0.28-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.28 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-20 12:00:09 +0200) * Upstream changes: - v0.0.28 - listers: Remove unused columns `origin_id` / `description` - gnu-lister: Use origin-type as 'tar' (and not 'gnu') - phabricator: Remove unused code -- Software Heritage autobuilder (on jenkins-debian1) Thu, 20 Jun 2019 10:07:48 +0000 swh-lister (0.0.27-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.27 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-18 10:27:09 +0200) * Upstream changes: - v0.0.27 - Unify lister tablenames to use consistently singular - Add missing instance field to phabricator repository model -- Software Heritage autobuilder (on jenkins-debian1) Tue, 18 Jun 2019 08:44:38 +0000 swh-lister (0.0.26-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.26 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-17 17:53:33 +0200) * Upstream changes: - v0.0.26 - phabricator.lister: Use credentials setup from configuration file - gitlab.lister: Remove request_params method override -- Software Heritage autobuilder (on jenkins-debian1) Mon, 17 Jun 2019 16:05:05 +0000 swh-lister (0.0.25-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.25 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-06-13 15:54:42 +0200) * Upstream changes: - v0.0.25 - Add new cran lister - listers: Stop creating origins when scheduling new tasks -- Software Heritage autobuilder (on jenkins-debian1) Thu, 13 Jun 2019 13:59:30 +0000 swh-lister (0.0.24-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.24 - (tagged by Antoine R. Dumont (@ardumont) on 2019-06-12 12:02:54 +0200) * Upstream changes: - v0.0.24 - swh.lister.gnu: Add new gnu lister -- Software Heritage autobuilder (on jenkins-debian1) Wed, 12 Jun 2019 10:10:56 +0000 swh-lister (0.0.23-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.23 - (tagged by Antoine R. Dumont (@ardumont) on 2019-05-29 14:04:22 +0200) * Upstream changes: - v0.0.23 - lister: Unify credentials structure between listers -- Software Heritage autobuilder (on jenkins-debian1) Wed, 29 May 2019 12:10:51 +0000 swh-lister (0.0.22-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.22 - (tagged by Antoine Lambert on 2019-05-23 10:59:39 +0200) * Upstream changes: - version 0.0.22 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 May 2019 09:05:34 +0000 swh-lister (0.0.21-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.21 - (tagged by Antoine Lambert on 2019-04-11 11:00:55 +0200) * Upstream changes: - version 0.0.21 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:05:30 +0000 swh-lister (0.0.20-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.20 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-02-14 10:50:06 +0100) * Upstream changes: - v0.0.20 - d/*: debian packaging files migrated to separated branches - lister.cli: Fix spelling typo -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 09:59:29 +0000 swh-lister (0.0.19-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.19 - (tagged by David Douard on 2019-02-07 17:36:33 +0100) * Upstream changes: - v0.0.19 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 07 Feb 2019 16:42:39 +0000 swh-lister (0.0.18-1~swh1) unstable-swh; urgency=medium * v0.0.18 * docs: add title and brief module description * gitlab.lister: Break asap when problem exists during fetch info * gitlab.lister: Do not expect gitlab instances to have credentials * setup: prepare for pypi upload * gitlab/models.py: drop unused import -- Antoine R. Dumont (@ardumont) Mon, 08 Oct 2018 15:54:12 +0200 swh-lister (0.0.17-1~swh1) unstable-swh; urgency=medium * v0.0.17 * Change pypi project url to use the /project api -- Antoine R. Dumont (@ardumont) Tue, 18 Sep 2018 11:35:25 +0200 swh-lister (0.0.16-1~swh1) unstable-swh; urgency=medium * v0.0.16 * Normalize PyPI name -- Antoine R. Dumont (@ardumont) Fri, 14 Sep 2018 13:25:56 +0200 swh-lister (0.0.15-1~swh1) unstable-swh; urgency=medium * v0.0.15 * Add pypi lister -- Antoine R. Dumont (@ardumont) Thu, 06 Sep 2018 17:09:25 +0200 swh-lister (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * core.lister_base: Batch create origins (storage) & tasks (scheduler) * swh.lister.cli: Add debian lister to the list of supported listers * README.md: Update to demo the lister debian run -- Antoine R. Dumont (@ardumont) Tue, 31 Jul 2018 15:46:12 +0200 swh-lister (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Fix missing use cases when unable to retrieve information from the api * server * gitlab/lister: Allow specifying the number of elements to * read (default is 20, same as the current gitlab api) -- Antoine R. 
Dumont (@ardumont) Fri, 20 Jul 2018 13:46:04 +0200 swh-lister (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * swh.lister.gitlab.tasks: Use gitlab as instance name for gitlab.com * README.md: Add gitlab to the lister implementations referenced * core/lister_base: Remove unused import -- Antoine R. Dumont (@ardumont) Thu, 19 Jul 2018 11:29:14 +0200 swh-lister (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * lister/gitlab: Add gitlab lister * docs: Update documentation to demonstrate how to run a lister locally * core/lister: Make the listers' scheduler configuration adaptable * debian/*: Fix debian packaging tests -- Antoine R. Dumont (@ardumont) Wed, 18 Jul 2018 14:16:56 +0200 swh-lister (0.0.10-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.10 * Add missing task_queue attribute for debian listing tasks * Make sure tests run during build * Clean up runtime dependencies -- Nicolas Dandrimont Mon, 30 Oct 2017 17:37:25 +0100 swh-lister (0.0.9-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.9 * Add tasks for the Debian lister -- Nicolas Dandrimont Mon, 30 Oct 2017 14:20:58 +0100 swh-lister (0.0.8-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.8 * Add versioned dependency on sqlalchemy -- Nicolas Dandrimont Fri, 13 Oct 2017 12:15:38 +0200 swh-lister (0.0.7-1~swh1) unstable-swh; urgency=medium * Release swh.lister version 0.0.7 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:52 +0200 swh-lister (0.0.6-1~swh1) unstable-swh; urgency=medium * Release swh.lister v0.0.6 * Add new debian lister -- Nicolas Dandrimont Wed, 11 Oct 2017 17:59:47 +0200 swh-lister (0.0.5-1~swh1) unstable-swh; urgency=medium * Release swh.lister 0.0.5 * Make the lister more generic * Add bitbucket lister * Update tasks to new swh.scheduler API -- Nicolas Dandrimont Mon, 12 Jun 2017 18:22:13 +0200 swh-lister (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * Update storage configuration reading -- Antoine R. 
Dumont (@ardumont) Thu, 15 Dec 2016 19:07:24 +0100 swh-lister (0.0.3-1~swh1) unstable-swh; urgency=medium * Release swh.lister.github v0.0.3 * Generate swh.scheduler tasks and swh.storage origins on the fly * Use celery tasks to schedule own work -- Nicolas Dandrimont Thu, 20 Oct 2016 17:30:39 +0200 swh-lister (0.0.2-1~swh1) unstable-swh; urgency=medium * Release swh.lister.github 0.0.2 * Move constants to a constants module to avoid circular imports -- Nicolas Dandrimont Thu, 17 Mar 2016 20:35:11 +0100 swh-lister (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * Release swh.lister.github v0.0.1 -- Nicolas Dandrimont Thu, 17 Mar 2016 19:01:20 +0100 diff --git a/debian/control b/debian/control index 6c3a94a..1679381 100644 --- a/debian/control +++ b/debian/control @@ -1,37 +1,38 @@ Source: swh-lister Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 3), python3-all, python3-bs4, python3-debian, python3-iso8601, python3-launchpadlib, + python3-lxml, python3-testresources, python3-pytest, python3-pytest-mock, python3-requests, python3-requests-mock, python3-setuptools, python3-swh.core (>= 0.9), python3-swh.core.db.pytestplugin (>= 0.9), python3-swh.scheduler (>= 0.0.31~), python3-tenacity, python3-setuptools-scm, python3-xmltodict, opam, git, Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/source/swh-lister/ Package: python3-swh.lister Architecture: all Depends: python3-swh.core (>= 0.9), python3-swh.scheduler (>= 0.0.31~), ${misc:Depends}, ${python3:Depends} Breaks: python3-swh.lister.github Replaces: python3-swh.lister.github Description: Software Heritage Listers (bitbucket, git(lab|hub), pypi, etc...) 
diff --git a/mypy.ini b/mypy.ini index b5a4295..eb2343b 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,42 +1,45 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-bs4.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-debian.*] ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-launchpadlib.*] ignore_missing_imports = True [mypy-lazr.*] ignore_missing_imports = True +[mypy-lxml.*] +ignore_missing_imports = True + [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True [mypy-requests_mock.*] ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True [mypy-xmltodict.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index c57eecc..ea5ee0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity >= 6.2 xmltodict +lxml diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index c8f6500..178896e 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,127 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.8.1 +Version: 2.8.2 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public 
License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index ed4710e..1e67e34 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,260 +1,263 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml ACKNOWLEDGEMENTS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/new_lister_template.py docs/run_a_new_lister.rst docs/save_forge.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/cli.py swh/lister/pattern.py swh/lister/py.typed swh/lister/utils.py swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/test_lister.py swh/lister/bitbucket/tests/test_tasks.py swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json swh/lister/bitbucket/tests/data/bb_api_repositories_page2.json swh/lister/cgit/__init__.py swh/lister/cgit/lister.py swh/lister/cgit/tasks.py swh/lister/cgit/tests/__init__.py swh/lister/cgit/tests/repo_list.txt swh/lister/cgit/tests/test_lister.py swh/lister/cgit/tests/test_tasks.py swh/lister/cgit/tests/data/https_git.baserock.org/cgit swh/lister/cgit/tests/data/https_git.eclipse.org/c swh/lister/cgit/tests/data/https_git.savannah.gnu.org/README swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit 
swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit_elisp-es.git swh/lister/cgit/tests/data/https_git.tizen/README swh/lister/cgit/tests/data/https_git.tizen/cgit swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=100 swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=50 swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Users swh/lister/cgit/tests/data/https_git.tizen/cgit_Lock-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-base swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-mc1n2 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e3250 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e4x12 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_devices_nfc-plugin-nxp swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_intel_mfld_bootstub-mfld-blackbay swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_mtdev swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_opengl-es-virtual-drv swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libdrm swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libnl swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_xorg_driver_xserver-xorg-misc swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-gallery-efl swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-homescreen-efl swh/lister/cgit/tests/data/https_jff.email/cgit swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/cran/tests/data/list-r-packages.json swh/lister/crates/__init__.py swh/lister/crates/lister.py swh/lister/crates/tasks.py swh/lister/crates/tests/__init__.py 
swh/lister/crates/tests/test_lister.py swh/lister/crates/tests/test_tasks.py swh/lister/crates/tests/data/fake-crates-repository.tar.gz swh/lister/crates/tests/data/fake_crates_repository_init.sh swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/test_lister.py swh/lister/debian/tests/test_tasks.py swh/lister/debian/tests/data/Sources_bullseye swh/lister/debian/tests/data/Sources_buster swh/lister/debian/tests/data/Sources_stretch swh/lister/gitea/__init__.py swh/lister/gitea/lister.py swh/lister/gitea/tasks.py swh/lister/gitea/tests/__init__.py swh/lister/gitea/tests/test_lister.py swh/lister/gitea/tests/test_tasks.py swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/tasks.py swh/lister/github/tests/__init__.py swh/lister/github/tests/test_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/test_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json swh/lister/gitlab/tests/data/https_gitlab.com/api_response_page1.json swh/lister/gnu/__init__.py swh/lister/gnu/lister.py swh/lister/gnu/tasks.py swh/lister/gnu/tree.py swh/lister/gnu/tests/__init__.py swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/gnu/tests/test_tree.py swh/lister/gnu/tests/data/tree.json swh/lister/gnu/tests/data/tree.min.json swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz 
swh/lister/launchpad/__init__.py swh/lister/launchpad/lister.py swh/lister/launchpad/tasks.py swh/lister/launchpad/tests/__init__.py swh/lister/launchpad/tests/conftest.py swh/lister/launchpad/tests/test_lister.py swh/lister/launchpad/tests/test_tasks.py swh/lister/launchpad/tests/data/launchpad_bzr_response.json swh/lister/launchpad/tests/data/launchpad_response1.json swh/lister/launchpad/tests/data/launchpad_response2.json swh/lister/maven/README.md swh/lister/maven/__init__.py swh/lister/maven/lister.py swh/lister/maven/tasks.py swh/lister/maven/tests/__init__.py swh/lister/maven/tests/test_lister.py swh/lister/maven/tests/test_tasks.py swh/lister/maven/tests/data/http_indexes/export.fld swh/lister/maven/tests/data/http_indexes/export_incr.fld swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/tasks.py swh/lister/npm/tests/test_lister.py swh/lister/npm/tests/test_tasks.py swh/lister/npm/tests/data/npm_full_page1.json swh/lister/npm/tests/data/npm_full_page2.json swh/lister/npm/tests/data/npm_incremental_page1.json swh/lister/npm/tests/data/npm_incremental_page2.json swh/lister/opam/__init__.py swh/lister/opam/lister.py swh/lister/opam/tasks.py swh/lister/opam/tests/__init__.py swh/lister/opam/tests/test_lister.py swh/lister/opam/tests/test_tasks.py swh/lister/opam/tests/data/fake_opam_repo/repo swh/lister/opam/tests/data/fake_opam_repo/version swh/lister/opam/tests/data/fake_opam_repo/packages/agrid/agrid.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.3/opam 
swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.4/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.5/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.6/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/ocb/ocb.0.1/opam swh/lister/packagist/__init__.py swh/lister/packagist/lister.py swh/lister/packagist/tasks.py swh/lister/packagist/tests/__init__.py swh/lister/packagist/tests/test_lister.py swh/lister/packagist/tests/test_tasks.py swh/lister/packagist/tests/data/den1n_contextmenu.json swh/lister/packagist/tests/data/ljjackson_linnworks.json swh/lister/packagist/tests/data/lky_wx_article.json swh/lister/packagist/tests/data/spryker-eco_computop-api.json swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/phabricator/tests/data/__init__.py swh/lister/phabricator/tests/data/phabricator_api_repositories_page1.json swh/lister/phabricator/tests/data/phabricator_api_repositories_page2.json swh/lister/pypi/__init__.py swh/lister/pypi/lister.py swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/test_lister.py swh/lister/pypi/tests/test_tasks.py swh/lister/sourceforge/__init__.py swh/lister/sourceforge/lister.py swh/lister/sourceforge/tasks.py swh/lister/sourceforge/tests/__init__.py swh/lister/sourceforge/tests/test_lister.py swh/lister/sourceforge/tests/test_tasks.py swh/lister/sourceforge/tests/data/aaron.html swh/lister/sourceforge/tests/data/aaron.json swh/lister/sourceforge/tests/data/adobexmp.json 
swh/lister/sourceforge/tests/data/backapps-website.json swh/lister/sourceforge/tests/data/backapps.json -swh/lister/sourceforge/tests/data/bzr-repo.json swh/lister/sourceforge/tests/data/main-sitemap.xml swh/lister/sourceforge/tests/data/mojunk.json swh/lister/sourceforge/tests/data/mramm.json +swh/lister/sourceforge/tests/data/ocaml-lpd.html +swh/lister/sourceforge/tests/data/ocaml-lpd.json swh/lister/sourceforge/tests/data/os3dmodels.json swh/lister/sourceforge/tests/data/random-mercurial.json swh/lister/sourceforge/tests/data/subsitemap-0.xml swh/lister/sourceforge/tests/data/subsitemap-1.xml +swh/lister/sourceforge/tests/data/t12eksandbox.html +swh/lister/sourceforge/tests/data/t12eksandbox.json swh/lister/tests/__init__.py swh/lister/tests/test_cli.py swh/lister/tests/test_pattern.py swh/lister/tests/test_utils.py swh/lister/tuleap/__init__.py swh/lister/tuleap/lister.py swh/lister/tuleap/tasks.py swh/lister/tuleap/tests/__init__.py swh/lister/tuleap/tests/test_lister.py swh/lister/tuleap/tests/test_tasks.py swh/lister/tuleap/tests/data/https_tuleap.net/projects swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 swh/lister/tuleap/tests/data/https_tuleap.net/repo_3 \ No newline at end of file diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index cf7ec2b..6caae43 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,18 +1,19 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity>=6.2 xmltodict +lxml swh.core[db]>=0.9 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index c95a089..dcc30c3 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,424 +1,457 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The 
Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + from dataclasses import dataclass, field import datetime from enum import Enum import logging import re from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from xml.etree import ElementTree from bs4 import BeautifulSoup import iso8601 +import lxml import requests from tenacity.before_sleep import before_sleep_log from swh.core.api.classes import stream_results from swh.lister.utils import retry_policy_generic, throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) class VcsNames(Enum): """Used to filter SourceForge tool names for valid VCS types""" # CVS projects are read-only CVS = "cvs" GIT = "git" SUBVERSION = "svn" MERCURIAL = "hg" BAZAAR = "bzr" VCS_NAMES = set(v.value for v in VcsNames.__members__.values()) @dataclass class SourceForgeListerEntry: vcs: VcsNames url: str last_modified: datetime.date SubSitemapNameT = str ProjectNameT = str # SourceForge only offers day-level granularity, which is good enough for our purposes LastModifiedT = datetime.date @dataclass class SourceForgeListerState: """Current state of the SourceForge lister in incremental runs""" """If the subsitemap does not exist, we assume a full run of this subsitemap is needed. If the date is the same, we skip the subsitemap, otherwise we request the subsitemap and look up every project's "last modified" date to compare against `ListedOrigins` from the database.""" subsitemap_last_modified: Dict[SubSitemapNameT, LastModifiedT] = field( default_factory=dict ) """Some projects (not the majority, but still meaningful) have no VCS for us to archive. 
We need to remember a mapping of their API URL to their "last modified" date so we don't keep querying them needlessly every time.""" empty_projects: Dict[str, LastModifiedT] = field(default_factory=dict) SourceForgeListerPage = List[SourceForgeListerEntry] MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" # API resource endpoint for information about the given project. # # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe` # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" # Predictable URL for cloning (in the broad sense) a VCS registered for the project. # # Warning: does not apply to bzr repos, and Mercurial are http only, see use of this # constant below. # # `vcs`: VCS type, one of `VCS_NAMES` # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe`. # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses # `git` (https://git.code.sf.net/p/codeblocks/git). CLONE_URL_FORMAT = "https://{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" PROJ_URL_RE = re.compile( r"^https://sourceforge.net/(?P[^/]+)/(?P[^/]+)/(?P.*)?" ) # Mapping of `(namespace, project name)` to `last modified` date. 
ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge.""" # Part of the lister API, that identifies this lister LISTER_NAME = "sourceforge" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, ) # Will hold the currently saved "last modified" dates to compare against our # requests. self._project_last_modified: Optional[ProjectsLastModifiedCache] = None self.session = requests.Session() # Declare the USER_AGENT is more sysadm-friendly for the forge we list self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) self.incremental = incremental def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: subsitemaps = { k: datetime.date.fromisoformat(v) for k, v in d.get("subsitemap_last_modified", {}).items() } empty_projects = { k: datetime.date.fromisoformat(v) for k, v in d.get("empty_projects", {}).items() } return SourceForgeListerState( subsitemap_last_modified=subsitemaps, empty_projects=empty_projects ) def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]: return { "subsitemap_last_modified": { k: v.isoformat() for k, v in state.subsitemap_last_modified.items() }, "empty_projects": { k: v.isoformat() for k, v in state.empty_projects.items() }, } def projects_last_modified(self) -> ProjectsLastModifiedCache: if not self.incremental: # No point in loading the previous results if we're doing a full run return {} if self._project_last_modified is not None: return self._project_last_modified # We know there will be at least that many origins stream = stream_results( self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000 ) listed_origins = dict() # Projects can 
have slashes in them if they're subprojects, but the # mointpoint (last component) cannot. url_match = re.compile( r".*\.code\.sf\.net/(?P[^/]+)/(?P.+)/.*" ) bzr_url_match = re.compile( - r"http://(?P[^/]+).bzr.sourceforge.net/bzrroot/([^/]+)" + r"http://(?P[^/]+).bzr.sourceforge.net/bzr/([^/]+)" ) cvs_url_match = re.compile( r"rsync://a.cvs.sourceforge.net/cvsroot/(?P.+)/([^/]+)" ) for origin in stream: url = origin.url match = url_match.match(url) if match is None: # Could be a bzr or cvs special endpoint bzr_match = bzr_url_match.match(url) cvs_match = cvs_url_match.match(url) matches = None if bzr_match is not None: matches = bzr_match.groupdict() elif cvs_match is not None: matches = cvs_match.groupdict() assert matches project = matches["project"] namespace = "p" # no special namespacing for bzr and cvs projects else: matches = match.groupdict() namespace = matches["namespace"] project = matches["project"] # "Last modified" dates are the same across all VCS (tools, even) # within a project or subproject. An assertion here would be overkill. last_modified = origin.last_update assert last_modified is not None listed_origins[(namespace, project)] = last_modified.date() self._project_last_modified = listed_origins return listed_origins @throttling_retry( retry=retry_policy_generic, before_sleep=before_sleep_log(logger, logging.WARNING), ) def page_request(self, url, params) -> requests.Response: # Log listed URL to ease debugging logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: # Log response content to ease debugging logger.warning( "Unexpected HTTP status code %s for URL %s", response.status_code, response.url, ) # The lister must fail on blocking errors response.raise_for_status() return response def get_pages(self) -> Iterator[SourceForgeListerPage]: """ SourceForge has a main XML sitemap that lists its sharded sitemaps for all projects. 
Each XML sub-sitemap lists project pages, which are not unique per project: a project can have a wiki, a home, a git, an svn, etc. For each unique project, we query an API endpoint that lists (among other things) the tools associated with said project, some of which are the VCS used. Subprojects are considered separate projects. Lastly we use the information of which VCS are used to build the predictable clone URL for any given VCS. """ sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text tree = ElementTree.fromstring(sitemap_contents) for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_el is not None and last_modified_el.text is not None last_modified = datetime.date.fromisoformat(last_modified_el.text) location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None and location.text is not None sub_url = location.text if self.incremental: recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) if recorded_last_mod == last_modified: # The entire subsitemap hasn't changed, so none of its projects # have either, skip it. 
continue self.state.subsitemap_last_modified[sub_url] = last_modified subsitemap_contents = self.page_request(sub_url, {}).text subtree = ElementTree.fromstring(subsitemap_contents) yield from self._get_pages_from_subsitemap(subtree) def get_origins_from_page( self, page: SourceForgeListerPage ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: last_modified: str = str(hit.last_modified) last_update: datetime.datetime = iso8601.parse_date(last_modified) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, last_update=last_update, ) def _get_pages_from_subsitemap( self, subtree: ElementTree.Element ) -> Iterator[SourceForgeListerPage]: projects: Set[ProjectNameT] = set() for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_block is not None last_modified = last_modified_block.text location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None project_url = location.text assert project_url is not None match = PROJ_URL_RE.match(project_url) if match: matches = match.groupdict() namespace = matches["namespace"] if namespace == "projects": # These have a `p`-namespaced counterpart, use that instead continue project = matches["project"] rest = matches["rest"] if rest.count("/") > 1: # This is a subproject. There exists no sub-subprojects. subproject_name = rest.rsplit("/", 2)[0] project = f"{project}/{subproject_name}" prev_len = len(projects) projects.add(project) if prev_len == len(projects): # Already seen continue pages = self._get_pages_for_project(namespace, project, last_modified) if pages: yield pages else: logger.debug("Project '%s' does not have any VCS", project) else: # Should almost always match, let's log it # The only ones that don't match are mostly specialized one-off URLs. 
msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) def _get_pages_for_project( self, namespace, project, last_modified ) -> SourceForgeListerPage: endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project) empty_project_last_modified = self.state.empty_projects.get(endpoint) if empty_project_last_modified is not None: if last_modified == empty_project_last_modified.isoformat(): # Project has not changed, so is still empty, meaning it has # no VCS attached that we can archive. logger.debug(f"Project {namespace}/{project} is still empty") return [] if self.incremental: expected = self.projects_last_modified().get((namespace, project)) if expected is not None: if expected.isoformat() == last_modified: # Project has not changed logger.debug(f"Project {namespace}/{project} has not changed") return [] else: logger.debug(f"Project {namespace}/{project} was updated") else: msg = "New project during an incremental run: %s/%s" logger.debug(msg, namespace, project) try: res = self.page_request(endpoint, {}).json() except requests.HTTPError: # We've already logged in `page_request` return [] tools = res.get("tools") if tools is None: # This rarely happens, on very old URLs logger.warning("Project '%s' does not have any tools", endpoint) return [] hits = [] for tool in tools: tool_name = tool["name"] if tool_name not in VCS_NAMES: continue if tool_name == VcsNames.CVS.value: # CVS projects are different from other VCS ones, they use the rsync # protocol, a list of modules needs to be fetched from an info page # and multiple origin URLs can be produced for a same project. 
cvs_info_url = f"http://{project}.cvs.sourceforge.net" try: response = self.page_request(cvs_info_url, params={}) except requests.HTTPError: logger.warning( "CVS info page could not be fetched, skipping project '%s'", project, ) continue else: bs = BeautifulSoup(response.text, features="html.parser") cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot" for text in [b.text for b in bs.find_all("b")]: match = re.search(rf".*/cvsroot/{project} co -P (.+)", text) if match is not None: module = match.group(1) - url = f"{cvs_base_url}/{project}/{module}" - hits.append( - SourceForgeListerEntry( - vcs=VcsNames(tool_name), - url=url, - last_modified=last_modified, + if module != "Attic": + url = f"{cvs_base_url}/{project}/{module}" + hits.append( + SourceForgeListerEntry( + vcs=VcsNames(tool_name), + url=url, + last_modified=last_modified, + ) ) - ) continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, project=project, mount_point=tool["mount_point"], ) if tool_name == VcsNames.MERCURIAL.value: # SourceForge does not yet support anonymous HTTPS cloning for Mercurial # See https://sourceforge.net/p/forge/feature-requests/727/ url = url.replace("https://", "http://") if tool_name == VcsNames.BAZAAR.value: # SourceForge has removed support for bzr and only keeps legacy projects # around at a separate (also not https) URL. Bzr projects are very rare # and a lot of them are 404 now. 
- url = f"http://{project}.bzr.sourceforge.net/bzrroot/{project}" + url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" + try: + response = self.page_request(url, params={}) + if "To get this branch, use:" not in response.text: + # If a bzr project has multiple branches, we need to extract their + # names from the repository landing page and create one listed origin + # per branch + parser = lxml.etree.HTMLParser() + tree = lxml.etree.fromstring(response.text, parser) + + # Get all tds with class 'autcell' + tds = tree.xpath(".//td[contains(@class, 'autcell')]") + for td in tds: + branch = td.findtext("a") + # If the td's parent contains Branch and + # it has non-empty text: + if td.xpath("..//img[@alt='Branch']") and branch: + hits.append( + SourceForgeListerEntry( + vcs=VcsNames(tool_name), + url=f"{url}/{branch}", + last_modified=last_modified, + ) + ) + continue + except requests.HTTPError: + logger.warning( + "Bazaar repository page could not be fetched, skipping project '%s'", + project, + ) + continue entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) hits.append(entry) if not hits: date = datetime.date.fromisoformat(last_modified) self.state.empty_projects[endpoint] = date else: self.state.empty_projects.pop(endpoint, None) return hits diff --git a/swh/lister/sourceforge/tests/data/aaron.html b/swh/lister/sourceforge/tests/data/aaron.html index 5b1c226..7b9d5f4 100644 --- a/swh/lister/sourceforge/tests/data/aaron.html +++ b/swh/lister/sourceforge/tests/data/aaron.html @@ -1,23 +1,24 @@ CVS Info for project aaron

The aaron project's CVS data is in read-only mode, so the project may have switched over to another source-code-management system. To check, visit the Project Summary Page for aaron and see if the menubar lists a newer code repository, such as SVN or Git.

The CVS data can be accessed as follows. You can run a per-module CVS checkout via pserver protocol:

  • cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P aaron
  • cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P www
  • +
  • cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P Attic
  • You can view a list of files or copy all the CVS repository data via rsync (the 1st command lists the files, the 2nd copies):

  • rsync -a a.cvs.sourceforge.net::cvsroot/aaron/
  • rsync -ai a.cvs.sourceforge.net::cvsroot/aaron/ /my/local/dest/dir/
  • If you are a project admin for aaron, you can request that this page redirect to another repo on your project by submitting a support request. diff --git a/swh/lister/sourceforge/tests/data/bzr-repo.json b/swh/lister/sourceforge/tests/data/bzr-repo.json deleted file mode 100644 index 380e8e6..0000000 --- a/swh/lister/sourceforge/tests/data/bzr-repo.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "shortname": "bzr-repo", - "name": "Bazaar repo", - "_id": "4bf3fc291be1ce2f10000052", - "url": "https://sourceforge.net/p/bzr-repo/", - "private": false, - "short_description": "This is an example bzr project", - "creation_date": "2009-10-10", - "summary": "", - "external_homepage": "", - "video_url": "", - "socialnetworks": [], - "status": "active", - "moved_to_url": "", - "preferred_support_tool": "", - "preferred_support_url": "", - "developers": [ - { - "username": "Alphare", - "name": "Raphaël Gomès", - "url": "https://sourceforge.net/u/alphare/" - } - ], - "tools": [ - { - "name": "bzr", - "mount_point": "bzr", - "url": "/p/bzr-repo/bazaar/", - "icons": { - "24": "images/code_24.png", - "32": "images/code_32.png", - "48": "images/code_48.png" - }, - "installable": true, - "tool_label": "Bazaar", - "mount_label": "Bazaar" - } - ], - "labels": [], - "categories": { - "audience": [], - "developmentstatus": [], - "environment": [], - "language": [], - "license": [], - "translation": [], - "os": [], - "database": [], - "topic": [] - }, - "icon_url": null, - "screenshots": [] - } diff --git a/swh/lister/sourceforge/tests/data/ocaml-lpd.html b/swh/lister/sourceforge/tests/data/ocaml-lpd.html new file mode 100644 index 0000000..c313505 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/ocaml-lpd.html @@ -0,0 +1,106 @@ + + + +SourceForge: Browsing /ocaml-lpd + + + + + + + + +

    +
    +

    +Browsing + +(root)/ocaml-lpd + +

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FilenameLatest RevLast Changed
    + + +.. +
    + +Folder + + +backup.bzr.~1~
    + +Branch + + +trunk +13 +2011-04-17 22:02:29
    +
    +
    +
    +
    +
    + +
    +Help +
    +
    +
    + + \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/ocaml-lpd.json b/swh/lister/sourceforge/tests/data/ocaml-lpd.json new file mode 100644 index 0000000..bf2607b --- /dev/null +++ b/swh/lister/sourceforge/tests/data/ocaml-lpd.json @@ -0,0 +1,201 @@ +{ + "shortname": "ocaml-lpd", + "name": "Lpd OCaml library", + "_id": "50c63c70e88f3d0bf07d4c6d", + "url": "https://sourceforge.net/p/ocaml-lpd/", + "private": false, + "short_description": "OCaml Lpd is a Line Printer Daemon (LPD) server library written in OCaml. This project moved to OCamlForge https://forge.ocamlcore.org/projects/lpd/", + "creation_date": "2005-02-23", + "summary": "", + "external_homepage": "http://lpd.forge.ocamlcore.org/", + "video_url": "", + "socialnetworks": [], + "status": "moved", + "moved_to_url": "https://forge.ocamlcore.org/projects/lpd/", + "preferred_support_tool": "", + "preferred_support_url": "", + "developers": [ + { + "username": "chris_77", + "name": "ChriS", + "url": "https://sourceforge.net/u/chris_77/" + } + ], + "tools": [ + { + "name": "files-sf", + "mount_point": "files", + "url": "/p/ocaml-lpd/files/", + "icons": { + "24": "images/downloads_24.png", + "32": "images/downloads_32.png", + "48": "images/downloads_48.png" + }, + "installable": false, + "tool_label": "Files", + "mount_label": "Files" + }, + { + "name": "mailman", + "mount_point": "mailman", + "url": "/p/ocaml-lpd/mailman/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": false, + "tool_label": "Mailing Lists", + "mount_label": "Mailing Lists" + }, + { + "name": "bzr", + "mount_point": "code", + "url": "/p/ocaml-lpd/code/", + "icons": { + "24": "images/code_24.png", + "32": "images/code_32.png", + "48": "images/code_48.png" + }, + "installable": false, + "tool_label": "BZR", + "mount_label": "Code" + }, + { + "name": "summary", + "mount_point": "summary", + "url": "/p/ocaml-lpd/summary/", + "icons": { + 
"24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Summary", + "mount_label": "Summary", + "sourceforge_group_id": 132212 + }, + { + "name": "wiki", + "mount_point": "wiki", + "url": "/p/ocaml-lpd/wiki/", + "icons": { + "24": "images/wiki_24.png", + "32": "images/wiki_32.png", + "48": "images/wiki_48.png" + }, + "installable": true, + "tool_label": "Wiki", + "mount_label": "Wiki" + }, + { + "name": "reviews", + "mount_point": "reviews", + "url": "/p/ocaml-lpd/reviews/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Reviews", + "mount_label": "Reviews" + }, + { + "name": "support", + "mount_point": "support", + "url": "/p/ocaml-lpd/support/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Support", + "mount_label": "Support" + }, + { + "name": "activity", + "mount_point": "activity", + "url": "/p/ocaml-lpd/activity/", + "icons": { + "24": "images/admin_24.png", + "32": "images/admin_32.png", + "48": "images/admin_48.png" + }, + "installable": false, + "tool_label": "Tool", + "mount_label": "Activity" + } + ], + "labels": [], + "categories": { + "audience": [ + { + "id": 3, + "shortname": "developers", + "fullname": "Developers", + "fullpath": "Intended Audience :: by End-User Class :: Developers" + } + ], + "developmentstatus": [ + { + "id": 11, + "shortname": "production", + "fullname": "5 - Production/Stable", + "fullpath": "Development Status :: 5 - Production/Stable" + } + ], + "environment": [ + { + "id": 238, + "shortname": "daemon", + "fullname": "Non-interactive (Daemon)", + "fullpath": "User Interface :: Non-interactive (Daemon)" + } + ], + "language": [ 
+ { + "id": 454, + "shortname": "ocaml", + "fullname": "OCaml (Objective Caml)", + "fullpath": "Programming Language :: OCaml (Objective Caml)" + } + ], + "license": [ + { + "id": 16, + "shortname": "lgpl", + "fullname": "GNU Library or Lesser General Public License version 2.0 (LGPLv2)", + "fullpath": "License :: OSI-Approved Open Source :: GNU Library or Lesser General Public License version 2.0 (LGPLv2)" + } + ], + "translation": [ + { + "id": 275, + "shortname": "english", + "fullname": "English", + "fullpath": "Translations :: English" + } + ], + "os": [ + { + "id": 436, + "shortname": "os_portable", + "fullname": "OS Portable (Source code to work with many OS platforms)", + "fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Portable (Source code to work with many OS platforms)" + } + ], + "database": [], + "topic": [ + { + "id": 154, + "shortname": "printing", + "fullname": "Printing", + "fullpath": "Topic :: Printing" + } + ] + }, + "icon_url": null, + "screenshots": [] +} \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/subsitemap-1.xml b/swh/lister/sourceforge/tests/data/subsitemap-1.xml index 290800b..e3f9c0d 100644 --- a/swh/lister/sourceforge/tests/data/subsitemap-1.xml +++ b/swh/lister/sourceforge/tests/data/subsitemap-1.xml @@ -1,48 +1,53 @@ https://sourceforge.net/projects/backapps/files/ 2021-02-11 daily https://sourceforge.net/p/backapps/tickets/ 2021-02-11 daily https://sourceforge.net/p/backapps/chat/ 2021-02-11 daily https://sourceforge.net/p/backapps/website/files/ 2021-02-11 daily https://sourceforge.net/p/backapps/website/tickets/ 2021-02-11 daily https://sourceforge.net/projects/mojunk/files/ 2017-12-31 daily https://sourceforge.net/p/mojunk/home/ 2017-12-31 daily https://sourceforge.net/p/random-mercurial/ 2019-05-02 daily - https://sourceforge.net/p/bzr-repo/ - 2021-01-27 + https://sourceforge.net/p/t12eksandbox/ + 2011-02-09 + daily + + + https://sourceforge.net/p/ocaml-lpd/ + 2011-04-17 
daily diff --git a/swh/lister/sourceforge/tests/data/t12eksandbox.html b/swh/lister/sourceforge/tests/data/t12eksandbox.html new file mode 100644 index 0000000..790cabd --- /dev/null +++ b/swh/lister/sourceforge/tests/data/t12eksandbox.html @@ -0,0 +1,274 @@ + + + + + +/t12eksandbox : changes + + + + + + + + + +
    + +
    + + +
    + +
    + + +RSS + + +
    +
    + + + +
    +
    + +

    + + + +(root)/t12eksandbox + + + +: changes + +from revision +4 + + + +

    + +
    +
    +To get this branch, use:
    +bzr branch +http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox +
    + + + + +

    +expand all expand all +

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Rev SummaryAuthorsDateDiffFiles
    +
    + + + +
    +
    +
    + +Commit! +
    + +
    ctsai at sourceforge +2011-02-09 + +Diff +Files +
    +
    + + + +
    +
    +
    + +fdsa +
    + +
    ctsai at sourceforge +2010-02-03 + +Diff +Files +
    +
    + + + +
    +
    +
    + +fdsa +
    + +
    ctsai at sourceforge +2009-10-12 + +Diff +Files +
    +
    + + + +
    +
    + + +ctsai at sourceforge +2009-10-12 + +Diff +Files +
    + +
    +

    Loggerhead 1.18.1 is a web-based interface for Bazaar branches

    +
    +
    +
    +
    +
    + +
    +Help +
    +
    +
    + + \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/t12eksandbox.json b/swh/lister/sourceforge/tests/data/t12eksandbox.json new file mode 100644 index 0000000..df379c9 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/t12eksandbox.json @@ -0,0 +1,292 @@ +{ + "shortname": "t12eksandbox", + "name": "t12ek sandbox", + "_id": "5304cd2634309d109fc1dec5", + "url": "https://sourceforge.net/p/t12eksandbox/", + "private": false, + "short_description": "Sandboxes are for playing in... Note: this is an SF.net staff's test project. Don't expect to find real files here. Update test!\r\nLine 2!\r\nupdate 2012-06-05", + "creation_date": "2009-07-14", + "summary": "", + "external_homepage": "http://t12eksandbox.sourceforge.net", + "video_url": "", + "socialnetworks": [], + "status": "active", + "moved_to_url": "", + "preferred_support_tool": "_url", + "preferred_support_url": "http://sourceforge.net/tracker/?func=add&group_id=269579&atid=1146768", + "developers": [ + { + "username": "sillygoose", + "name": "sillygoose", + "url": "https://sourceforge.net/u/sillygoose/" + }, + { + "username": "thimsmith", + "name": "Tim Siegel", + "url": "https://sourceforge.net/u/thimsmith/" + } + ], + "tools": [ + { + "name": "reviews", + "mount_point": "reviews", + "url": "/p/t12eksandbox/reviews/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Reviews", + "mount_label": "Reviews" + }, + { + "name": "summary", + "mount_point": "summary", + "url": "/p/t12eksandbox/summary/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Summary", + "mount_label": "Summary", + "sourceforge_group_id": 269579 + }, + { + "name": "mailman", + "mount_point": "mailman", + "url": 
"/p/t12eksandbox/mailman/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": false, + "tool_label": "Mailing Lists", + "mount_label": "Mailing Lists" + }, + { + "name": "support", + "mount_point": "support", + "url": "/p/t12eksandbox/support/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Support", + "mount_label": "Support" + }, + { + "name": "files-sf", + "mount_point": "files", + "url": "/p/t12eksandbox/files/", + "icons": { + "24": "images/downloads_24.png", + "32": "images/downloads_32.png", + "48": "images/downloads_48.png" + }, + "installable": false, + "tool_label": "Files", + "mount_label": "Files" + }, + { + "name": "wiki", + "mount_point": "wiki", + "url": "/p/t12eksandbox/wiki/", + "icons": { + "24": "images/wiki_24.png", + "32": "images/wiki_32.png", + "48": "images/wiki_48.png" + }, + "installable": true, + "tool_label": "Wiki", + "mount_label": "Wiki" + }, + { + "name": "blog", + "mount_point": "news", + "url": "/p/t12eksandbox/news/", + "icons": { + "24": "images/blog_24.png", + "32": "images/blog_32.png", + "48": "images/blog_48.png" + }, + "installable": true, + "tool_label": "Blog", + "mount_label": "News" + }, + { + "name": "bzr", + "mount_point": "bazaar", + "url": "/p/t12eksandbox/bazaar/", + "icons": { + "24": "images/code_24.png", + "32": "images/code_32.png", + "48": "images/code_48.png" + }, + "installable": false, + "tool_label": "BZR", + "mount_label": "Bazaar" + }, + { + "name": "discussion", + "mount_point": "discussion", + "url": "/p/t12eksandbox/discussion/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": true, + "tool_label": "Discussion", + "mount_label": "Discussion" + }, + { + "name": "tickets", + "mount_point": 
"support-requests", + "url": "/p/t12eksandbox/support-requests/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Support Requests" + }, + { + "name": "tickets", + "mount_point": "feature-requests", + "url": "/p/t12eksandbox/feature-requests/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Feature Requests" + }, + { + "name": "link", + "mount_point": "donate", + "url": "/p/t12eksandbox/donate/", + "icons": { + "24": "images/ext_24.png", + "32": "images/ext_32.png", + "48": "images/ext_48.png" + }, + "installable": true, + "tool_label": "External Link", + "mount_label": "Donate" + }, + { + "name": "tickets", + "mount_point": "patches", + "url": "/p/t12eksandbox/patches/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Patches" + }, + { + "name": "tickets", + "mount_point": "bugs", + "url": "/p/t12eksandbox/bugs/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Bugs" + }, + { + "name": "activity", + "mount_point": "activity", + "url": "/p/t12eksandbox/activity/", + "icons": { + "24": "images/admin_24.png", + "32": "images/admin_32.png", + "48": "images/admin_48.png" + }, + "installable": false, + "tool_label": "Tool", + "mount_label": "Activity" + } + ], + "labels": [], + "categories": { + "audience": [], + "developmentstatus": [ + { + "id": 10, + "shortname": "beta", + "fullname": "4 - Beta", + "fullpath": "Development Status :: 4 - Beta" + }, + { + "id": 7, + "shortname": "planning", + "fullname": "1 - Planning", + "fullpath": 
"Development Status :: 1 - Planning" + } + ], + "environment": [], + "language": [], + "license": [ + { + "id": 196, + "shortname": "other", + "fullname": "Other License", + "fullpath": "License :: Other License" + } + ], + "translation": [], + "os": [], + "database": [ + { + "id": 524, + "shortname": "db_net_mysql", + "fullname": "MySQL", + "fullpath": "Database Environment :: Network-based DBMS :: MySQL" + } + ], + "topic": [ + { + "id": 575, + "shortname": "testing", + "fullname": "Testing", + "fullpath": "Topic :: Software Development :: Testing" + }, + { + "id": 97, + "shortname": "scientific", + "fullname": "Scientific/Engineering", + "fullpath": "Topic :: Scientific/Engineering" + } + ] + }, + "icon_url": null, + "screenshots": [ + { + "url": "https://sourceforge.net/p/t12eksandbox/screenshot/224498.jpg", + "thumbnail_url": "https://sourceforge.net/p/t12eksandbox/screenshot/224498.jpg/thumb", + "caption": "aimage2" + }, + { + "url": "https://sourceforge.net/p/t12eksandbox/screenshot/224496.jpg", + "thumbnail_url": "https://sourceforge.net/p/t12eksandbox/screenshot/224496.jpg/thumb", + "caption": "3Kimage3" + } + ] +} \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py index a288ce6..516f562 100644 --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -1,463 +1,536 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import functools import json from pathlib import Path import re from iso8601 import iso8601 import pytest from requests.exceptions import HTTPError from swh.lister import USER_AGENT from swh.lister.sourceforge.lister import ( MAIN_SITEMAP_URL, PROJECT_API_URL_FORMAT, SourceForgeLister, SourceForgeListerState, ) 
from swh.lister.tests.test_utils import assert_sleep_calls from swh.lister.utils import WAIT_EXP_BASE # Mapping of project name to namespace from swh.scheduler.model import ListedOrigin TEST_PROJECTS = { "aaron": "p", "adobexmp": "adobe", "backapps": "p", "backapps/website": "p", "bzr-repo": "p", "mojunk": "p", "mramm": "p", "os3dmodels": "p", "random-mercurial": "p", + "t12eksandbox": "p", + "ocaml-lpd": "p", } URLS_MATCHER = { PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project): project for project, namespace in TEST_PROJECTS.items() } def get_main_sitemap(datadir): return Path(datadir, "main-sitemap.xml").read_text() def get_subsitemap_0(datadir): return Path(datadir, "subsitemap-0.xml").read_text() def get_subsitemap_1(datadir): return Path(datadir, "subsitemap-1.xml").read_text() def get_project_json(datadir, request, context): url = request.url project = URLS_MATCHER.get(url) assert project is not None, f"Url '{url}' could not be matched" project = project.replace("/", "-") return json.loads(Path(datadir, f"{project}.json").read_text()) def get_cvs_info_page(datadir): return Path(datadir, "aaron.html").read_text() +def get_bzr_repo_page(datadir, repo_name): + return Path(datadir, f"{repo_name}.html").read_text() + + def _check_request_headers(request): return request.headers.get("User-Agent") == USER_AGENT def _check_listed_origins(lister, swh_scheduler): scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} assert res == { "https://svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"), "https://git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"), "https://svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"), "https://git.code.sf.net/p/mramm/files": ("git", "2019-04-04"), "https://git.code.sf.net/p/mramm/git": ("git", "2019-04-04"), "https://svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"), 
"https://git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"), "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), - "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), + "http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox": ( + "bzr", + "2011-02-09", + ), + "http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk": ( + "bzr", + "2011-04-17", + ), "rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"), "rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"), } def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): """ Simulate a full listing of an artificially restricted sourceforge. There are 5 different projects, spread over two sub-sitemaps, a few of which have multiple VCS listed, one has none, one is outside of the standard `/p/` namespace, some with custom mount points. All non-interesting but related entries have been kept. 
""" lister = SourceForgeLister(scheduler=swh_scheduler) requests_mock.get( MAIN_SITEMAP_URL, text=get_main_sitemap(datadir), additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-0.xml", text=get_subsitemap_0(datadir), additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-1.xml", text=get_subsitemap_1(datadir), additional_matcher=_check_request_headers, ) requests_mock.get( re.compile("https://sourceforge.net/rest/.*"), json=functools.partial(get_project_json, datadir), additional_matcher=_check_request_headers, ) requests_mock.get( re.compile("http://aaron.cvs.sourceforge.net/"), text=get_cvs_info_page(datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) stats = lister.run() # - os3dmodels (2 repos), # - mramm (3 repos), # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). - # - bzr-repo (1 repo). + # - t12eksandbox (1 repo). + # - ocaml-lpd (1 repo). # adobe and backapps itself have no repos. 
- assert stats.pages == 7 - assert stats.origins == 13 + assert stats.pages == 8 + assert stats.origins == 14 expected_state = { "subsitemap_last_modified": { "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18", }, "empty_projects": { "https://sourceforge.net/rest/p/backapps": "2021-02-11", "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17", }, } assert lister.state_to_dict(lister.state) == expected_state _check_listed_origins(lister, swh_scheduler) def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, mocker): """ Simulate an incremental listing of an artificially restricted sourceforge. Same dataset as the full run, because it's enough to validate the different cases. """ lister = SourceForgeLister(scheduler=swh_scheduler, incremental=True) requests_mock.get( MAIN_SITEMAP_URL, text=get_main_sitemap(datadir), additional_matcher=_check_request_headers, ) def not_called(request, *args, **kwargs): raise AssertionError(f"Should not have been called: '{request.url}'") requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-0.xml", text=get_subsitemap_0(datadir), additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-1.xml", text=not_called, additional_matcher=_check_request_headers, ) def filtered_get_project_json(request, context): # These projects should not be requested again assert URLS_MATCHER[request.url] not in {"adobe", "mojunk"} return get_project_json(datadir, request, context) requests_mock.get( re.compile("https://sourceforge.net/rest/.*"), json=filtered_get_project_json, additional_matcher=_check_request_headers, ) requests_mock.get( re.compile("http://aaron.cvs.sourceforge.net/"), text=get_cvs_info_page(datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + 
text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) + faked_listed_origins = [ # mramm: changed ListedOrigin( lister_id=lister.lister_obj.id, visit_type="git", url="https://git.code.sf.net/p/mramm/files", last_update=iso8601.parse_date("2019-01-01"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="git", url="https://git.code.sf.net/p/mramm/git", last_update=iso8601.parse_date("2019-01-01"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="svn", url="https://svn.code.sf.net/p/mramm/svn", last_update=iso8601.parse_date("2019-01-01"), ), # stayed the same, even though its subsitemap has changed ListedOrigin( lister_id=lister.lister_obj.id, visit_type="git", url="https://git.code.sf.net/p/os3dmodels/git", last_update=iso8601.parse_date("2017-03-31"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="svn", url="https://svn.code.sf.net/p/os3dmodels/svn", last_update=iso8601.parse_date("2017-03-31"), ), # others: stayed the same, should be skipped ListedOrigin( lister_id=lister.lister_obj.id, visit_type="git", url="https://git.code.sf.net/p/mojunk/git", last_update=iso8601.parse_date("2017-12-31"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="git", url="https://git.code.sf.net/p/mojunk/git2", last_update=iso8601.parse_date("2017-12-31"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="svn", url="https://svn.code.sf.net/p/mojunk/svn", last_update=iso8601.parse_date("2017-12-31"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="svn", url="https://svn.code.sf.net/p/backapps/website/code", last_update=iso8601.parse_date("2021-02-11"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="hg", url="http://hg.code.sf.net/p/random-mercurial/hg", 
last_update=iso8601.parse_date("2019-05-02"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="bzr", - url="http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo", - last_update=iso8601.parse_date("2021-01-27"), + url="http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox", + last_update=iso8601.parse_date("2011-02-09"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="bzr", + url="http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk", + last_update=iso8601.parse_date("2011-04-17"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="cvs", url="rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron", last_update=iso8601.parse_date("2013-03-07"), ), ListedOrigin( lister_id=lister.lister_obj.id, visit_type="cvs", url="rsync://a.cvs.sourceforge.net/cvsroot/aaron/www", last_update=iso8601.parse_date("2013-03-07"), ), ] swh_scheduler.record_listed_origins(faked_listed_origins) to_date = datetime.date.fromisoformat faked_state = SourceForgeListerState( subsitemap_last_modified={ # changed "https://sourceforge.net/allura_sitemap/sitemap-0.xml": to_date( "2021-02-18" ), # stayed the same "https://sourceforge.net/allura_sitemap/sitemap-1.xml": to_date( "2021-03-18" ), }, empty_projects={ "https://sourceforge.net/rest/p/backapps": to_date("2020-02-11"), "https://sourceforge.net/rest/adobe/adobexmp": to_date("2017-10-17"), }, ) lister.state = faked_state stats = lister.run() # - mramm (3 repos), # changed assert stats.pages == 1 assert stats.origins == 3 expected_state = { "subsitemap_last_modified": { "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18", }, "empty_projects": { "https://sourceforge.net/rest/p/backapps": "2021-02-11", # changed "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17", }, } assert lister.state_to_dict(lister.state) == expected_state # origins have been updated _check_listed_origins(lister, swh_scheduler) def 
test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir): lister = SourceForgeLister(scheduler=swh_scheduler) # Exponential retries take a long time, so stub time.sleep mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep") requests_mock.get( MAIN_SITEMAP_URL, [ {"status_code": 429}, {"status_code": 429}, {"text": get_main_sitemap(datadir)}, ], additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-0.xml", [{"status_code": 429}, {"text": get_subsitemap_0(datadir), "status_code": 301}], additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-1.xml", [{"status_code": 429}, {"text": get_subsitemap_1(datadir)}], additional_matcher=_check_request_headers, ) requests_mock.get( re.compile("https://sourceforge.net/rest/.*"), [{"status_code": 429}, {"json": functools.partial(get_project_json, datadir)}], additional_matcher=_check_request_headers, ) requests_mock.get( re.compile("http://aaron.cvs.sourceforge.net/"), text=get_cvs_info_page(datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) + stats = lister.run() # - os3dmodels (2 repos), # - mramm (3 repos), # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). - # - bzr-repo (1 repo). + # - t12eksandbox (1 repo). + # - ocaml-lpd (1 repo). # adobe and backapps itself have no repos. 
- assert stats.pages == 7 - assert stats.origins == 13 + assert stats.pages == 8 + assert stats.origins == 14 _check_listed_origins(lister, swh_scheduler) # Test `time.sleep` is called with exponential retries assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1]) @pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404]) def test_sourceforge_lister_http_error( swh_scheduler, requests_mock, status_code, mocker ): lister = SourceForgeLister(scheduler=swh_scheduler) # Exponential retries take a long time, so stub time.sleep mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep") requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code) with pytest.raises(HTTPError): lister.run() exp_retries = [] if status_code >= 500: exp_retries = [1.0, 10.0, 100.0, 1000.0] assert_sleep_calls(mocker, mocked_sleep, exp_retries) @pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404]) def test_sourceforge_lister_project_error( datadir, swh_scheduler, requests_mock, status_code, mocker ): lister = SourceForgeLister(scheduler=swh_scheduler) # Exponential retries take a long time, so stub time.sleep mocker.patch.object(lister.page_request.retry, "sleep") requests_mock.get( MAIN_SITEMAP_URL, text=get_main_sitemap(datadir), additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-0.xml", text=get_subsitemap_0(datadir), additional_matcher=_check_request_headers, ) requests_mock.get( "https://sourceforge.net/allura_sitemap/sitemap-1.xml", text=get_subsitemap_1(datadir), additional_matcher=_check_request_headers, ) # Request mocks precedence is LIFO requests_mock.get( re.compile("https://sourceforge.net/rest/.*"), json=functools.partial(get_project_json, datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + 
additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) # Make all `mramm` requests fail # `mramm` is in subsitemap 0, which ensures we keep listing after an error. requests_mock.get( re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code ) # Make request to CVS info page fail requests_mock.get( re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code ) stats = lister.run() # - os3dmodels (2 repos), # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). - # - bzr-repo (1 repo). + # - t12eksandbox (1 repo). + # - ocaml-lpd (1 repo). # adobe and backapps itself have no repos. # Did *not* list mramm - assert stats.pages == 5 - assert stats.origins == 8 + assert stats.pages == 6 + assert stats.origins == 9 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} # Ensure no `mramm` origins are listed, but all others are. assert res == { "https://svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"), "https://git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"), "https://svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"), "https://git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"), "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), - "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), + "http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox": ( + "bzr", + "2011-02-09", + ), + "http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk": ( + "bzr", + "2011-04-17", + ), }