diff --git a/PKG-INFO b/PKG-INFO index 487ccb8..e4b3dfd 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,125 +1,126 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 4.2.0 +Version: 4.3.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` +- `swh.lister.fedora` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2.
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/README.md b/README.md index f54483f..4b89ee6 100644 --- a/README.md +++ b/README.md @@ -1,104 +1,105 @@ swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` +- `swh.lister.fedora` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2.
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/mypy.ini b/mypy.ini index 286fec0..7f9436b 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,50 +1,56 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-bs4.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-debian.*] ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-launchpadlib.*] ignore_missing_imports = True [mypy-lazr.*] ignore_missing_imports = True [mypy-lxml.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True [mypy-requests_mock.*] ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True [mypy-dulwich.*] ignore_missing_imports = True [mypy-testing.postgresql.*] ignore_missing_imports = True [mypy-psycopg2.*] ignore_missing_imports = True + +[mypy-repomd.*] +ignore_missing_imports = True + +[mypy-defusedxml.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 17a1e8f..2614f0a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity >= 6.2 lxml dulwich testing.postgresql psycopg2 +repomd diff --git a/setup.py b/setup.py index 7c55f6c..92a0272 100755 --- a/setup.py +++ b/setup.py @@ -1,103 +1,104 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = 
path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.lister", description="Software Heritage lister", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLSGH/", packages=find_packages(), install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), setup_requires=["setuptools-scm"], extras_require={"testing": parse_requirements("test")}, use_scm_version=True, include_package_data=True, entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] lister.arch=swh.lister.arch:register lister.aur=swh.lister.aur:register lister.bitbucket=swh.lister.bitbucket:register lister.bower=swh.lister.bower:register lister.cgit=swh.lister.cgit:register lister.conda=swh.lister.conda:register lister.cpan=swh.lister.cpan:register lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register lister.golang=swh.lister.golang:register lister.hackage=swh.lister.hackage:register lister.launchpad=swh.lister.launchpad:register lister.nixguix=swh.lister.nixguix:register lister.npm=swh.lister.npm:register lister.nuget=swh.lister.nuget:register lister.opam=swh.lister.opam:register lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pubdev=swh.lister.pubdev:register lister.puppet=swh.lister.puppet:register lister.pypi=swh.lister.pypi:register lister.rubygems=swh.lister.rubygems:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register lister.gogs=swh.lister.gogs:register + lister.fedora=swh.lister.fedora:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-lister", "Documentation": "https://docs.softwareheritage.org/devel/swh-lister/", }, ) diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 487ccb8..e4b3dfd 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,125 +1,126 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 4.2.0 +Version: 4.3.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, 
https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` +- `swh.lister.fedora` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index 863873c..2d6eca7 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,422 +1,440 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml ACKNOWLEDGEMENTS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/new_lister_template.py docs/run_a_new_lister.rst docs/save_forge.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/cli.py swh/lister/pattern.py swh/lister/py.typed swh/lister/utils.py swh/lister/arch/__init__.py swh/lister/arch/lister.py swh/lister/arch/tasks.py swh/lister/arch/tests/__init__.py swh/lister/arch/tests/test_lister.py swh/lister/arch/tests/test_tasks.py swh/lister/arch/tests/data/fake_archlinux_archives_init.sh swh/lister/arch/tests/data/https_archive.archlinux.org/packages_d_dialog swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gnome-code-assistance swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gzip swh/lister/arch/tests/data/https_archive.archlinux.org/packages_l_libasyncns swh/lister/arch/tests/data/https_archive.archlinux.org/packages_m_mercurial swh/lister/arch/tests/data/https_archive.archlinux.org/packages_p_python-hglib swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_community_os_x86_64_community.files.tar.gz swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_core_os_x86_64_core.files.tar.gz swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_extra_os_x86_64_extra.files.tar.gz swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_community_community.files.tar.gz swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_core.files.tar.gz swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_extra_extra.files.tar.gz swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_community_community.files.tar.gz swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_core_core.files.tar.gz swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_extra_extra.files.tar.gz swh/lister/aur/__init__.py swh/lister/aur/lister.py swh/lister/aur/tasks.py swh/lister/aur/tests/__init__.py swh/lister/aur/tests/test_lister.py swh/lister/aur/tests/test_tasks.py swh/lister/aur/tests/data/fake_aur_packages.sh swh/lister/aur/tests/data/packages-meta-v1.json.gz swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/test_lister.py swh/lister/bitbucket/tests/test_tasks.py swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json swh/lister/bitbucket/tests/data/bb_api_repositories_page2.json 
swh/lister/bower/__init__.py swh/lister/bower/lister.py swh/lister/bower/tasks.py swh/lister/bower/tests/__init__.py swh/lister/bower/tests/test_lister.py swh/lister/bower/tests/test_tasks.py swh/lister/bower/tests/data/https_registry.bower.io/packages swh/lister/cgit/__init__.py swh/lister/cgit/lister.py swh/lister/cgit/tasks.py swh/lister/cgit/tests/__init__.py swh/lister/cgit/tests/repo_list.txt swh/lister/cgit/tests/test_lister.py swh/lister/cgit/tests/test_tasks.py swh/lister/cgit/tests/data/https_git.acdw.net/README swh/lister/cgit/tests/data/https_git.acdw.net/cgit swh/lister/cgit/tests/data/https_git.acdw.net/foo swh/lister/cgit/tests/data/https_git.acdw.net/foo_summary swh/lister/cgit/tests/data/https_git.acdw.net/sfeed swh/lister/cgit/tests/data/https_git.acdw.net/sfeed_summary swh/lister/cgit/tests/data/https_git.baserock.org/cgit swh/lister/cgit/tests/data/https_git.eclipse.org/c swh/lister/cgit/tests/data/https_git.savannah.gnu.org/README swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit_elisp-es.git swh/lister/cgit/tests/data/https_git.tizen/README swh/lister/cgit/tests/data/https_git.tizen/cgit swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=100 swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=50 swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Users swh/lister/cgit/tests/data/https_git.tizen/cgit_Lock-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-base swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-mc1n2 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e3250 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e4x12 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_devices_nfc-plugin-nxp swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_intel_mfld_bootstub-mfld-blackbay swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_mtdev swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_opengl-es-virtual-drv swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libdrm swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libnl swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_xorg_driver_xserver-xorg-misc swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-gallery-efl swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-homescreen-efl swh/lister/cgit/tests/data/https_jff.email/cgit swh/lister/conda/__init__.py swh/lister/conda/lister.py swh/lister/conda/tasks.py swh/lister/conda/tests/__init__.py swh/lister/conda/tests/test_lister.py swh/lister/conda/tests/test_tasks.py swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 swh/lister/cpan/__init__.py swh/lister/cpan/lister.py swh/lister/cpan/tasks.py swh/lister/cpan/tests/__init__.py swh/lister/cpan/tests/test_lister.py swh/lister/cpan/tests/test_tasks.py 
swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page1 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page2 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page3 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1__search_scroll_page4 swh/lister/cpan/tests/data/https_fastapi.metacpan.org/v1_release__search swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/cran/tests/data/list-r-packages.json swh/lister/crates/__init__.py swh/lister/crates/lister.py swh/lister/crates/tasks.py swh/lister/crates/tests/__init__.py swh/lister/crates/tests/test_lister.py swh/lister/crates/tests/test_tasks.py swh/lister/crates/tests/data/fake_crates_repository_init.sh swh/lister/crates/tests/data/https_static.crates.io/db-dump.tar.gz swh/lister/crates/tests/data/https_static.crates.io/db-dump.tar.gz_visit1 swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/test_lister.py swh/lister/debian/tests/test_tasks.py swh/lister/debian/tests/data/Sources_bullseye swh/lister/debian/tests/data/Sources_buster swh/lister/debian/tests/data/Sources_stretch +swh/lister/fedora/__init__.py +swh/lister/fedora/lister.py +swh/lister/fedora/tasks.py +swh/lister/fedora/tests/__init__.py +swh/lister/fedora/tests/test_lister.py +swh/lister/fedora/tests/test_tasks.py +swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz +swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz +swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz +swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml +swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml swh/lister/gitea/__init__.py swh/lister/gitea/lister.py swh/lister/gitea/tasks.py swh/lister/gitea/tests/__init__.py swh/lister/gitea/tests/test_lister.py swh/lister/gitea/tests/test_tasks.py swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/tasks.py swh/lister/github/utils.py swh/lister/github/tests/__init__.py swh/lister/github/tests/test_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/test_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json swh/lister/gitlab/tests/data/https_gitlab.com/api_response_page1.json swh/lister/gnu/__init__.py swh/lister/gnu/lister.py swh/lister/gnu/tasks.py swh/lister/gnu/tree.py swh/lister/gnu/tests/__init__.py swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/gnu/tests/test_tree.py swh/lister/gnu/tests/data/tree.json swh/lister/gnu/tests/data/tree.min.json swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz swh/lister/gogs/__init__.py swh/lister/gogs/lister.py swh/lister/gogs/tasks.py swh/lister/gogs/tests/__init__.py 
swh/lister/gogs/tests/test_lister.py swh/lister/gogs/tests/test_tasks.py swh/lister/gogs/tests/data/https_try.gogs.io/repos_page1 swh/lister/gogs/tests/data/https_try.gogs.io/repos_page2 swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 swh/lister/golang/__init__.py swh/lister/golang/lister.py swh/lister/golang/tasks.py swh/lister/golang/tests/__init__.py swh/lister/golang/tests/test_lister.py swh/lister/golang/tests/test_tasks.py swh/lister/golang/tests/data/page-1.txt swh/lister/golang/tests/data/page-2.txt swh/lister/golang/tests/data/page-3.txt swh/lister/hackage/__init__.py swh/lister/hackage/lister.py swh/lister/hackage/tasks.py swh/lister/hackage/tests/__init__.py swh/lister/hackage/tests/test_lister.py swh/lister/hackage/tests/test_tasks.py swh/lister/hackage/tests/data/https_fake49.haskell.org/packages_search_0 swh/lister/hackage/tests/data/https_fake51.haskell.org/packages_search_0 swh/lister/hackage/tests/data/https_fake51.haskell.org/packages_search_1 swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0 +swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 +swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_1 swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_2 swh/lister/launchpad/__init__.py swh/lister/launchpad/lister.py swh/lister/launchpad/tasks.py swh/lister/launchpad/tests/__init__.py swh/lister/launchpad/tests/conftest.py swh/lister/launchpad/tests/test_lister.py swh/lister/launchpad/tests/test_tasks.py swh/lister/launchpad/tests/data/launchpad_bzr_response.json swh/lister/launchpad/tests/data/launchpad_response1.json swh/lister/launchpad/tests/data/launchpad_response2.json swh/lister/maven/README.md swh/lister/maven/__init__.py swh/lister/maven/lister.py swh/lister/maven/tasks.py swh/lister/maven/tests/__init__.py swh/lister/maven/tests/test_lister.py swh/lister/maven/tests/test_tasks.py swh/lister/maven/tests/data/citrus-parent-3.0.7.pom swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom swh/lister/maven/tests/data/sprova4j-0.1.0.malformed.pom swh/lister/maven/tests/data/http_indexes/export_full.fld swh/lister/maven/tests/data/http_indexes/export_incr_first.fld swh/lister/maven/tests/data/http_indexes/export_null_mtime.fld swh/lister/maven/tests/data/https_api.github.com/repos_aldialimucaj_sprova4j swh/lister/maven/tests/data/https_api.github.com/repos_arangodb-community_arangodb-graphql-java swh/lister/maven/tests/data/https_api.github.com/repos_webx_citrus swh/lister/maven/tests/data/https_repo1.maven.org/maven2_al_aldi_sprova4j_0.1.0_sprova4j-0.1.0.pom swh/lister/maven/tests/data/https_repo1.maven.org/maven2_al_aldi_sprova4j_0.1.1_sprova4j-0.1.1.pom swh/lister/maven/tests/data/https_repo1.maven.org/maven2_com_arangodb_arangodb-graphql_1.2_arangodb-graphql-1.2.pom swh/lister/nixguix/__init__.py swh/lister/nixguix/lister.py swh/lister/nixguix/tasks.py swh/lister/nixguix/tests/__init__.py swh/lister/nixguix/tests/test_lister.py swh/lister/nixguix/tests/test_tasks.py swh/lister/nixguix/tests/data/sources-failure.json swh/lister/nixguix/tests/data/sources-success.json swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/tasks.py swh/lister/npm/tests/test_lister.py swh/lister/npm/tests/test_tasks.py swh/lister/npm/tests/data/npm_full_page1.json swh/lister/npm/tests/data/npm_full_page2.json 
swh/lister/npm/tests/data/npm_incremental_page1.json swh/lister/npm/tests/data/npm_incremental_page2.json swh/lister/nuget/__init__.py swh/lister/nuget/lister.py swh/lister/nuget/tasks.py swh/lister/nuget/tests/__init__.py swh/lister/nuget/tests/test_lister.py swh/lister/nuget/tests/test_tasks.py swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_intersoft.crosslight.logging.entityframework_5.0.5000.1235-experimental_intersoft.crosslight.logging.entityframework.nuspec +swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_sil.core.desktop_10.0.1-beta0012_sil.core.desktop.nuspec swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.09.23.08.07.54_sil.core.desktop.10.0.1-beta0012.json swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.09.23.09.10.26_intersoft.crosslight.logging.entityframework.5.0.5000.1235-experimental.json +swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json +swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page11702.json swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page16958.json +swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json swh/lister/opam/__init__.py swh/lister/opam/lister.py swh/lister/opam/tasks.py swh/lister/opam/tests/__init__.py swh/lister/opam/tests/test_lister.py swh/lister/opam/tests/test_tasks.py swh/lister/opam/tests/data/fake_opam_repo/repo swh/lister/opam/tests/data/fake_opam_repo/version swh/lister/opam/tests/data/fake_opam_repo/packages/agrid/agrid.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.4/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.5/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.6/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/ocb/ocb.0.1/opam swh/lister/packagist/__init__.py swh/lister/packagist/lister.py swh/lister/packagist/tasks.py swh/lister/packagist/tests/__init__.py swh/lister/packagist/tests/test_lister.py swh/lister/packagist/tests/test_tasks.py swh/lister/packagist/tests/data/den1n_contextmenu.json swh/lister/packagist/tests/data/idevlab_essential.json swh/lister/packagist/tests/data/ljjackson_linnworks.json swh/lister/packagist/tests/data/lky_wx_article.json swh/lister/packagist/tests/data/payrix_payrix-php.json swh/lister/packagist/tests/data/spryker-eco_computop-api.json swh/lister/packagist/tests/data/with_invalid_url.json swh/lister/packagist/tests/data/ycms_module-main.json swh/lister/packagist/tests/data/https_api.github.com/repos_gitlky_wx_article swh/lister/packagist/tests/data/https_api.github.com/repos_spryker-eco_computop-api swh/lister/packagist/tests/data/https_api.github.com/repos_ycms_module-main 
swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/phabricator/tests/data/__init__.py swh/lister/phabricator/tests/data/phabricator_api_repositories_page1.json swh/lister/phabricator/tests/data/phabricator_api_repositories_page2.json swh/lister/pubdev/__init__.py swh/lister/pubdev/lister.py swh/lister/pubdev/tasks.py swh/lister/pubdev/tests/__init__.py swh/lister/pubdev/tests/test_lister.py swh/lister/pubdev/tests/test_tasks.py swh/lister/pubdev/tests/data/https_pub.dev/api_package-names swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon swh/lister/puppet/__init__.py swh/lister/puppet/lister.py swh/lister/puppet/tasks.py swh/lister/puppet/tests/__init__.py swh/lister/puppet/tests/test_lister.py swh/lister/puppet/tests/test_tasks.py swh/lister/puppet/tests/data/https_forgeapi.puppet.com/v3_modules,limit=100 swh/lister/puppet/tests/data/https_forgeapi.puppet.com/v3_modules,limit=100,offset=100 +swh/lister/puppet/tests/data/https_forgeapi.puppet.com/v3_modules,limit=100,with_release_since=2022-09-26 swh/lister/pypi/__init__.py swh/lister/pypi/lister.py swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/test_lister.py swh/lister/pypi/tests/test_tasks.py swh/lister/rubygems/__init__.py swh/lister/rubygems/lister.py swh/lister/rubygems/tasks.py swh/lister/rubygems/tests/__init__.py swh/lister/rubygems/tests/test_lister.py swh/lister/rubygems/tests/test_tasks.py swh/lister/rubygems/tests/data/rubygems_dumps.xml swh/lister/rubygems/tests/data/rubygems_pgsql_dump.tar swh/lister/rubygems/tests/data/small_rubygems_dump.sh swh/lister/sourceforge/__init__.py swh/lister/sourceforge/lister.py swh/lister/sourceforge/tasks.py swh/lister/sourceforge/tests/__init__.py swh/lister/sourceforge/tests/test_lister.py swh/lister/sourceforge/tests/test_tasks.py swh/lister/sourceforge/tests/data/aaron.html swh/lister/sourceforge/tests/data/aaron.json swh/lister/sourceforge/tests/data/adobexmp.json swh/lister/sourceforge/tests/data/backapps-website.json swh/lister/sourceforge/tests/data/backapps.json swh/lister/sourceforge/tests/data/main-sitemap.xml swh/lister/sourceforge/tests/data/mojunk.json swh/lister/sourceforge/tests/data/mramm.json swh/lister/sourceforge/tests/data/ocaml-lpd.html swh/lister/sourceforge/tests/data/ocaml-lpd.json swh/lister/sourceforge/tests/data/os3dmodels.json swh/lister/sourceforge/tests/data/random-mercurial.json swh/lister/sourceforge/tests/data/subsitemap-0.xml swh/lister/sourceforge/tests/data/subsitemap-1.xml swh/lister/sourceforge/tests/data/t12eksandbox.html swh/lister/sourceforge/tests/data/t12eksandbox.json swh/lister/tests/__init__.py swh/lister/tests/test_cli.py swh/lister/tests/test_pattern.py swh/lister/tests/test_utils.py swh/lister/tuleap/__init__.py swh/lister/tuleap/lister.py swh/lister/tuleap/tasks.py swh/lister/tuleap/tests/__init__.py swh/lister/tuleap/tests/test_lister.py swh/lister/tuleap/tests/test_tasks.py swh/lister/tuleap/tests/data/https_tuleap.net/projects swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 swh/lister/tuleap/tests/data/https_tuleap.net/repo_3 \ No newline at end of file diff --git a/swh.lister.egg-info/entry_points.txt b/swh.lister.egg-info/entry_points.txt index a31a0c1..153682a 100644 --- 
a/swh.lister.egg-info/entry_points.txt +++ b/swh.lister.egg-info/entry_points.txt @@ -1,35 +1,36 @@ [swh.cli.subcommands] lister = swh.lister.cli [swh.workers] lister.arch = swh.lister.arch:register lister.aur = swh.lister.aur:register lister.bitbucket = swh.lister.bitbucket:register lister.bower = swh.lister.bower:register lister.cgit = swh.lister.cgit:register lister.conda = swh.lister.conda:register lister.cpan = swh.lister.cpan:register lister.cran = swh.lister.cran:register lister.crates = swh.lister.crates:register lister.debian = swh.lister.debian:register +lister.fedora = swh.lister.fedora:register lister.gitea = swh.lister.gitea:register lister.github = swh.lister.github:register lister.gitlab = swh.lister.gitlab:register lister.gnu = swh.lister.gnu:register lister.gogs = swh.lister.gogs:register lister.golang = swh.lister.golang:register lister.hackage = swh.lister.hackage:register lister.launchpad = swh.lister.launchpad:register lister.maven = swh.lister.maven:register lister.nixguix = swh.lister.nixguix:register lister.npm = swh.lister.npm:register lister.nuget = swh.lister.nuget:register lister.opam = swh.lister.opam:register lister.packagist = swh.lister.packagist:register lister.phabricator = swh.lister.phabricator:register lister.pubdev = swh.lister.pubdev:register lister.puppet = swh.lister.puppet:register lister.pypi = swh.lister.pypi:register lister.rubygems = swh.lister.rubygems:register lister.sourceforge = swh.lister.sourceforge:register lister.tuleap = swh.lister.tuleap:register diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 6aaf3f2..2944ed4 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,21 +1,22 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity>=6.2 lxml dulwich testing.postgresql psycopg2 +repomd swh.core[db,github]>=2.16.1 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index 563fa18..c281f22 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -1,482 +1,488 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging from pathlib import Path import re import tarfile import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import unquote, urljoin from bs4 import BeautifulSoup from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. ArchListerPage = List[Dict[str, Any]] def size_to_bytes(size: str) -> int: """Convert human readable file size to bytes. Resulting value is an approximation as input value is in most case rounded. 
Args: size: A string representing a human readable file size (eg: '500K') Returns: A decimal representation of file size Examples:: >>> size_to_bytes("500") 500 >>> size_to_bytes("1K") 1000 """ units = { "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4, "P": 1000**5, "E": 1000**6, "Z": 1000**7, "Y": 1000**8, } if size.endswith(tuple(units)): v, u = (size[:-1], size[-1]) return int(v) * units[u] else: return int(size) class ArchLister(StatelessLister[ArchListerPage]): """List Arch linux origins from 'core', 'extra', and 'community' repositories For 'official' Arch Linux it downloads core.tar.gz, extra.tar.gz and community.tar.gz from https://archive.archlinux.org/repos/last/ extract to a temp directory and then walks through each 'desc' files. Each 'desc' file describe the latest released version of a package and helps to build an origin url from where scrapping artifacts metadata. For 'arm' Arch Linux it follow the same discovery process parsing 'desc' files. The main difference is that we can't get existing versions of an arm package because https://archlinuxarm.org does not have an 'archive' website or api. """ LISTER_NAME = "arch" VISIT_TYPE = "arch" INSTANCE = "arch" ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( "{base_url}/packages/{pkgname[0]}/{pkgname}/{filename}" ) ARCH_API_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}/json" ARM_PACKAGE_URL_PATTERN = "{base_url}/packages/{arch}/{pkgname}" ARM_PACKAGE_DOWNLOAD_URL_PATTERN = "{base_url}/{arch}/{repo}/{filename}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, flavours: Dict[str, Any] = { "official": { "archs": ["x86_64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinux.org", "base_archive_url": "https://archive.archlinux.org", "base_mirror_url": "", "base_api_url": "https://archlinux.org", }, "arm": { "archs": ["armv7h", "aarch64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinuxarm.org", "base_archive_url": "", "base_mirror_url": "https://uk.mirror.archlinuxarm.org", "base_api_url": "", }, }, ): super().__init__( scheduler=scheduler, credentials=credentials, url=flavours["official"]["base_info_url"], instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.flavours = flavours def scrap_package_versions( self, name: str, repo: str, base_url: str ) -> List[Dict[str, Any]]: """Given a package 'name' and 'repo', make an http call to origin url and parse its content to get package versions artifacts data. That method is suitable only for 'official' Arch Linux, not 'arm'. 
Args: name: Package name repo: The repository the package belongs to (one of self.repos) Returns: A list of dict of version Example:: [ {"url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 "arch": "x86_64", "repo": "core", "name": "dialog", "version": "1:1.3_20190211-1", "length": 180000, "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", "last_modified": "2019-02-13T08:36:00"}, ] """ url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( pkgname=name, base_url=base_url ) response = self.http_request(url) soup = BeautifulSoup(response.text, "html.parser") links = soup.find_all("a", href=True) # drop the first line (used to go to up directory) if links[0].attrs["href"] == "../": links.pop(0) versions = [] for link in links: # filename displayed can be cropped if name is too long, get it from href instead filename = unquote(link.attrs["href"]) if filename.endswith((".tar.xz", ".tar.zst")): # Extract arch from filename arch_rex = re.compile( rf"^{re.escape(name)}-(?P.*)-(?Pany|i686|x86_64)" rf"(.pkg.tar.(?:zst|xz))$" ) m = arch_rex.match(filename) if m is None: logger.error( "Can not find a match for architecture in %(filename)s", dict(filename=filename), ) else: arch = m.group("arch") version = m.group("version") # Extract last_modified and an approximate file size raw_text = link.next_sibling raw_text_rex = re.compile( r"^(?P\d+-\w+-\d+ \d\d:\d\d)\s+(?P\w+)$" ) s = raw_text_rex.search(raw_text.strip()) if s is None: logger.error( "Can not find a match for 'last_modified' and/or " "'size' in '%(raw_text)s'", dict(raw_text=raw_text), ) else: assert s.groups() assert len(s.groups()) == 2 last_modified_str, size = s.groups() # format as expected last_modified = datetime.datetime.strptime( last_modified_str, "%d-%b-%Y %H:%M" ).isoformat() length = size_to_bytes(size) # we want bytes # link url is relative, format a canonical one url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format( base_url=base_url, pkgname=name, filename=filename ) versions.append( dict( name=name, version=version, repo=repo, arch=arch, filename=filename, url=url, last_modified=last_modified, length=length, ) ) return versions def get_repo_archive(self, url: str, destination_path: Path) -> Path: """Given an url and a destination path, retrieve and extract .tar.gz archive which contains 'desc' file for each package. Each .tar.gz archive corresponds to an Arch Linux repo ('core', 'extra', 'community'). Args: url: url of the .tar.gz archive to download destination_path: the path on disk where to extract archive Returns: a directory Path where the archive has been extracted to. """ res = self.http_request(url) destination_path.parent.mkdir(parents=True, exist_ok=True) destination_path.write_bytes(res.content) extract_to = Path(str(destination_path).split(".tar.gz")[0]) tar = tarfile.open(destination_path) tar.extractall(path=extract_to) tar.close() return extract_to def parse_desc_file( self, path: Path, repo: str, base_url: str, dl_url_fmt: str, ) -> Dict[str, Any]: """Extract package information from a 'desc' file. 
There are subtle differences between parsing 'official' and 'arm' des files Args: path: A path to a 'desc' file on disk repo: The repo the package belongs to Returns: A dict of metadata Example:: {'api_url': 'https://archlinux.org/packages/core/x86_64/dialog/json', 'arch': 'x86_64', 'base': 'dialog', 'builddate': '1650081535', 'csize': '203028', 'desc': 'A tool to display dialog boxes from shell scripts', 'filename': 'dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', 'isize': '483988', 'license': 'LGPL2.1', 'md5sum': '06407c0cb11c50d7bf83d600f2e8107c', 'name': 'dialog', 'packager': 'Evangelos Foutras ', 'pgpsig': 'pgpsig content xxx', 'project_url': 'https://invisible-island.net/dialog/', 'provides': 'libdialog.so=15-64', 'repo': 'core', 'sha256sum': 'ef8c8971f591de7db0f455970ef5d81d5aced1ddf139f963f16f6730b1851fa7', 'url': 'https://archive.archlinux.org/packages/.all/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', # noqa: B950 'version': '1:1.3_20220414-1'} """ rex = re.compile(r"^\%(?P\w+)\%\n(?P.*)\n$", re.M) with path.open("rb") as content: parsed = rex.findall(content.read().decode()) data = {entry[0].lower(): entry[1] for entry in parsed} if "url" in data.keys(): data["project_url"] = data["url"] assert data["name"] assert data["filename"] assert data["arch"] data["repo"] = repo data["url"] = urljoin( base_url, dl_url_fmt.format( base_url=base_url, pkgname=data["name"], filename=data["filename"], arch=data["arch"], repo=repo, ), ) assert data["md5sum"] assert data["sha256sum"] data["checksums"] = { "md5sum": hash_to_hex(data["md5sum"]), "sha256sum": hash_to_hex(data["sha256sum"]), } return data def get_pages(self) -> Iterator[ArchListerPage]: """Yield an iterator sorted by name in ascending order of pages. Each page is a list of package belonging to a flavour ('official', 'arm'), and a repo ('core', 'extra', 'community') """ for name, flavour in self.flavours.items(): for arch in flavour["archs"]: for repo in flavour["repos"]: yield self._get_repo_page(name, flavour, arch, repo) def _get_repo_page( self, name: str, flavour: Dict[str, Any], arch: str, repo: str ) -> ArchListerPage: with tempfile.TemporaryDirectory() as tmpdir: page = [] if name == "official": prefix = urljoin(flavour["base_archive_url"], "/repos/last/") filename = f"{repo}.files.tar.gz" archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") destination_path = Path(tmpdir, arch, filename) base_url = flavour["base_archive_url"] dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN base_info_url = flavour["base_info_url"] info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN elif name == "arm": filename = f"{repo}.files.tar.gz" archive_url = urljoin( flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" ) destination_path = Path(tmpdir, arch, filename) base_url = flavour["base_mirror_url"] dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN base_info_url = flavour["base_info_url"] info_url_fmt = self.ARM_PACKAGE_URL_PATTERN archive = self.get_repo_archive( url=archive_url, destination_path=destination_path ) assert archive packages_desc = list(archive.glob("**/desc")) logger.debug( "Processing %(instance)s source packages info from " "%(flavour)s %(arch)s %(repo)s repository, " "(%(qty)s packages).", dict( instance=self.instance, flavour=name, arch=arch, repo=repo, qty=len(packages_desc), ), ) for package_desc in packages_desc: data = self.parse_desc_file( path=package_desc, repo=repo, base_url=base_url, dl_url_fmt=dl_url_fmt, ) assert data["builddate"] last_modified = datetime.datetime.fromtimestamp( float(data["builddate"]), 
tz=datetime.timezone.utc ) assert data["name"] assert data["filename"] assert data["arch"] url = info_url_fmt.format( base_url=base_info_url, pkgname=data["name"], filename=data["filename"], repo=repo, arch=data["arch"], ) assert data["version"] if name == "official": # find all versions of a package scrapping archive versions = self.scrap_package_versions( name=data["name"], repo=repo, base_url=base_url ) elif name == "arm": # There is no way to get related versions of a package, # but 'data' represents the latest released version, # use it in this case assert data["builddate"] assert data["csize"] assert data["url"] versions = [ dict( name=data["name"], version=data["version"], repo=repo, arch=data["arch"], filename=data["filename"], url=data["url"], last_modified=last_modified.replace(tzinfo=None).isoformat( timespec="seconds" ), length=int(data["csize"]), ) ] package = { "name": data["name"], "version": data["version"], "last_modified": last_modified, "url": url, "versions": versions, "data": data, } page.append(package) return page def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]: """Iterate on all arch pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for origin in page: artifacts = [] arch_metadata = [] for version in origin["versions"]: artifacts.append( { "version": version["version"], "filename": version["filename"], "url": version["url"], "length": version["length"], } ) if version["version"] == origin["version"]: artifacts[-1]["checksums"] = { "md5": origin["data"]["md5sum"], "sha256": origin["data"]["sha256sum"], } else: artifacts[-1]["checksums"] = {"length": version["length"]} arch_metadata.append( { "version": version["version"], "name": version["name"], "arch": version["arch"], "repo": version["repo"], "last_modified": version["last_modified"], } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin["url"], last_update=origin["last_modified"], extra_loader_arguments={ "artifacts": artifacts, "arch_metadata": arch_metadata, }, ) diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py index 9bbdf37..dc43d7d 100644 --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -1,152 +1,158 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging from typing import Any, Dict, Iterator, List, Optional from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. AurListerPage = Dict[str, Any] class AurLister(StatelessLister[AurListerPage]): """List Arch User Repository (AUR) origins. Given an url (used as a base url, default is 'https://aur.archlinux.org'), download a 'packages-meta-v1.json.gz' which contains a json file listing all existing packages definitions. Each entry describes the latest released version of a package. The origin url for a package is built using 'pkgname' and corresponds to a git repository. An rpc api exists but it is recommended to save bandwidth so it's not used. See https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html for more on this. 
""" LISTER_NAME = "aur" VISIT_TYPE = "aur" INSTANCE = "aur" BASE_URL = "https://aur.archlinux.org" DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz" PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git" PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz" ORIGIN_URL_PATTERN = "{base_url}/packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def download_packages_index(self) -> List[Dict[str, Any]]: """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string, and download the archive to self.DESTINATION_PATH Returns: a directory Path where the archive has been downloaded to. """ url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url) return self.http_request(url).json() def get_pages(self) -> Iterator[AurListerPage]: """Yield an iterator which returns 'page' Each page corresponds to a package with a 'version', an 'url' for a Git repository, a 'project_url' which represents the upstream project url and a canonical 'snapshot_url' from which a tar.gz archive of the package can be downloaded. """ packages = self.download_packages_index() logger.debug("Found %s AUR packages in aur_index", len(packages)) for package in packages: # Exclude lines where Name differs from PackageBase as they represents # split package and they don't have resolvable snapshots url if package["Name"] == package["PackageBase"]: logger.debug("Processing AUR package %s", package["Name"]) pkgname = package["PackageBase"] version = package["Version"] project_url = package["URL"] last_modified = datetime.datetime.fromtimestamp( float(package["LastModified"]), tz=datetime.timezone.utc ).isoformat() yield { "pkgname": pkgname, "version": version, "url": self.ORIGIN_URL_PATTERN.format( base_url=self.BASE_URL, pkgname=pkgname ), "git_url": self.PACKAGE_VCS_URL_PATTERN.format( base_url=self.BASE_URL, pkgname=pkgname ), "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format( base_url=self.BASE_URL, pkgname=pkgname ), "project_url": project_url, "last_modified": last_modified, } def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances. It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata` entries to 'extra_loader_arguments'. `artifacts` describe the file to download and `aur_metadata` store some metadata that can be useful for the loader. 
""" assert self.lister_obj.id is not None last_update = datetime.datetime.fromisoformat(origin["last_modified"]) filename = origin["snapshot_url"].split("/")[-1] artifacts = [ { "filename": filename, "url": origin["snapshot_url"], "version": origin["version"], } ] aur_metadata = [ { "version": origin["version"], "project_url": origin["project_url"], "last_update": origin["last_modified"], "pkgname": origin["pkgname"], } ] yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin["url"], last_update=last_update, extra_loader_arguments={ "artifacts": artifacts, "aur_metadata": aur_metadata, }, ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type="git", url=origin["git_url"], last_update=last_update, ) diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index 7bcec03..05720c9 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -1,173 +1,179 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime import logging import random from typing import Any, Dict, Iterator, List, Optional from urllib import parse import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class BitbucketListerState: """State of Bitbucket lister""" last_repo_cdate: Optional[datetime] = None """Creation date and time of the last listed repository during an incremental pass""" class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): """List origins from Bitbucket using its API. Bitbucket API has the following rate-limit configuration: * 60 requests per hour for anonymous users * 1000 requests per hour for authenticated users The lister is working in anonymous mode by default but Bitbucket account credentials can be provided to perform authenticated requests. 
""" LISTER_NAME = "bitbucket" INSTANCE = "bitbucket" API_URL = "https://api.bitbucket.org/2.0/repositories" def __init__( self, scheduler: SchedulerInterface, page_size: int = 1000, incremental: bool = True, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.url_params: Dict[str, Any] = { "pagelen": page_size, # only return needed JSON fields in bitbucket API responses # (also prevent errors 500 when listing) "fields": ( "next,values.links.clone.href,values.scm,values.updated_on," "values.created_on" ), } self.session.headers.update({"Accept": "application/json"}) if len(self.credentials) > 0: cred = random.choice(self.credentials) logger.warning("Using Bitbucket credentials from user %s", cred["username"]) self.set_credentials(cred["username"], cred["password"]) else: logger.warning("No credentials set in configuration, using anonymous mode") def state_from_dict(self, d: Dict[str, Any]) -> BitbucketListerState: last_repo_cdate = d.get("last_repo_cdate") if last_repo_cdate is not None: d["last_repo_cdate"] = iso8601.parse_date(last_repo_cdate) return BitbucketListerState(**d) def state_to_dict(self, state: BitbucketListerState) -> Dict[str, Any]: d = asdict(state) last_repo_cdate = d.get("last_repo_cdate") if last_repo_cdate is not None: d["last_repo_cdate"] = last_repo_cdate.isoformat() return d def set_credentials(self, username: Optional[str], password: Optional[str]) -> None: """Set basic authentication headers with given credentials.""" if username is not None and password is not None: self.session.auth = (username, password) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: last_repo_cdate: str = "1970-01-01" if ( self.incremental and self.state is not None and self.state.last_repo_cdate is not None ): last_repo_cdate = self.state.last_repo_cdate.isoformat() while True: self.url_params["after"] = last_repo_cdate body = self.http_request(self.url, params=self.url_params).json() yield body["values"] next_page_url = body.get("next") if next_page_url is not None: next_page_url = parse.urlparse(next_page_url) if not next_page_url.query: logger.warning("Failed to parse url %s", next_page_url) break last_repo_cdate = parse.parse_qs(next_page_url.query)["after"][0] else: # last page break def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of Bitbucket repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for repo in page: last_update = iso8601.parse_date(repo["updated_on"]) origin_url = repo["links"]["clone"][0]["href"] origin_type = repo["scm"] yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=origin_type, last_update=last_update, ) def commit_page(self, page: List[Dict[str, Any]]) -> None: """Update the currently stored state using the latest listed page.""" if self.incremental: last_repo = page[-1] last_repo_cdate = iso8601.parse_date(last_repo["created_on"]) if ( self.state.last_repo_cdate is None or last_repo_cdate > self.state.last_repo_cdate ): self.state.last_repo_cdate = last_repo_cdate def finalize(self) -> None: if self.incremental: scheduler_state = self.get_state_from_scheduler() if self.state.last_repo_cdate is None: 
return # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. if ( scheduler_state.last_repo_cdate is None or self.state.last_repo_cdate > scheduler_state.last_repo_cdate ): self.updated = True diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index 5b488e4..cc440dc 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -1,64 +1,70 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Dict, Iterator, List, Optional from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. BowerListerPage = List[Dict[str, str]] class BowerLister(StatelessLister[BowerListerPage]): """List Bower (Javascript package manager) origins.""" LISTER_NAME = "bower" VISIT_TYPE = "git" # Bower origins url are Git repositories INSTANCE = "bower" API_URL = "https://registry.bower.io/packages" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) def get_pages(self) -> Iterator[BowerListerPage]: """Yield an iterator which returns 'page' It uses the api endpoint provided by `https://registry.bower.io/packages` to get a list of package names with an origin url that corresponds to Git repository. There is only one page that list all origins urls. """ response = self.http_request(self.url) yield response.json() def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for entry in page: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=entry["url"], last_update=None, ) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 49458d0..4a9aeab 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,225 +1,231 @@ # Copyright (C) 2019-2022 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from requests.exceptions import HTTPError from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) Repositories = List[Dict[str, Any]] class CGitLister(StatelessLister[Repositories]): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. 
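
    Each page yielded by ``get_pages`` is a list of entries of the form
    (illustrative values)::

        {"url": "https://git.example.org/foo", "git_url": None,
         "last_updated_date": "2022-01-01 12:00:00 +0000"}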
The lister currently defines 2 listing behaviors: - If the `base_git_url` is provided, the listed origin urls are computed out of the base git url link and the one listed in the main listed page (resulting in less HTTP queries than the 2nd behavior below). This is expected to be the main deployed behavior. - Otherwise (with no `base_git_url`), for each found git repository listed, one extra HTTP query is made at the given url found in the main listing page to gather published "Clone" URLs to be used as origin URL for that git repo. If several "Clone" urls are provided, prefer the http/https one, if any, otherwise fallback to the first one. """ LISTER_NAME = "cgit" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, base_git_url: Optional[str] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): """Lister class for CGit repositories. Args: url: main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. instance: Name of cgit instance. Defaults to url's network location if unset. base_git_url: Optional base git url which allows the origin url computations. """ super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/html"}) self.base_git_url = base_git_url def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.http_request(url) return BeautifulSoup(response.text, features="html.parser") def get_pages(self) -> Iterator[Repositories]: """Generate git 'project' URLs found on the current CGit server The last_update date is retrieved on the list of repo page to avoid to compute it on the repository details which only give a date per branch """ next_page: Optional[str] = self.url while next_page: bs_idx = self._get_and_parse(next_page) page_results = [] for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): repository_link = tr.find("a")["href"] repo_url = None git_url = None base_url = urljoin(self.url, repository_link).strip("/") if self.base_git_url: # mapping provided # computing git url git_url = base_url.replace(self.url, self.base_git_url) else: # we compute the git detailed page url from which we will retrieve # the git url (cf. 
self.get_origins_from_page) repo_url = base_url span = tr.find("span", {"class": re.compile("age-")}) last_updated_date = span.get("title") if span else None page_results.append( { "url": repo_url, "git_url": git_url, "last_updated_date": last_updated_date, } ) yield page_results try: pager = bs_idx.find("ul", {"class": "pager"}) current_page = pager.find("a", {"class": "current"}) if current_page: next_page = current_page.parent.next_sibling.a["href"] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page next_page = None def get_origins_from_page( self, repositories: Repositories ) -> Iterator[ListedOrigin]: """Convert a page of cgit repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for repo in repositories: origin_url = repo["git_url"] or self._get_origin_from_repository_url( repo["url"] ) if origin_url is None: continue yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="git", last_update=_parse_last_updated_date(repo), ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: """Extract the git url from the repository page""" try: bs = self._get_and_parse(repository_url) except HTTPError as e: logger.warning( "Unexpected HTTP status code %s on %s", e.response.status_code, e.response.url, ) return None # check if we are on the summary tab, if not, go to this tab tab = bs.find("table", {"class": "tabs"}) if tab: summary_a = tab.find("a", string="summary") if summary_a: summary_url = urljoin(repository_url, summary_a["href"]).strip("/") if summary_url != repository_url: logger.debug( "%s : Active tab is not the summary, trying to load the summary page", repository_url, ) return self._get_origin_from_repository_url(summary_url) else: logger.debug("No summary tab found on %s", repository_url) # origin urls are listed on the repository page # TODO check if forcing https is better or not ? 
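        # clone urls are exposed on the summary page as <a rel='vcs-git'> links;
        # several may be present (e.g. a git:// and an http(s):// variant)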
# # # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: logger.debug("No git urls found on %s", repository_url) return None # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] return origin_url def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: """Parse the last updated date""" date = repository.get("last_updated_date") if not date: return None parsed_date = None for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"): try: parsed_date = datetime.strptime(date, date_format) # force UTC to avoid naive datetime if not parsed_date.tzinfo: parsed_date = parsed_date.replace(tzinfo=timezone.utc) break except Exception: pass if not parsed_date: logger.warning( "Could not parse %s last_updated date: %s", repository["url"], date, ) return parsed_date diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py index ab0190f..4f5cb40 100644 --- a/swh/lister/conda/lister.py +++ b/swh/lister/conda/lister.py @@ -1,123 +1,129 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bz2 from collections import defaultdict import datetime import json import logging from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]] class CondaLister(StatelessLister[CondaListerPage]): """List Conda (anaconda.com) origins.""" LISTER_NAME = "conda" VISIT_TYPE = "conda" INSTANCE = "conda" BASE_REPO_URL = "https://repo.anaconda.com/pkgs" REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2" ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}" ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, url: str = BASE_REPO_URL, channel: str = "", archs: List = [], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=url, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.channel: str = channel self.archs: List[str] = archs self.packages: Dict[str, Any] = defaultdict(dict) self.package_dates: Dict[str, Any] = defaultdict(list) def get_pages(self) -> Iterator[CondaListerPage]: """Yield an iterator which returns 'page'""" for arch in self.archs: repodata_url = self.REPO_URL_PATTERN.format( url=self.url, channel=self.channel, arch=arch ) response = self.http_request(url=repodata_url) packages: Dict[str, Any] = json.loads(bz2.decompress(response.content))[ "packages" ] yield (arch, packages) def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None arch, packages = page package_names = set() for filename, package_metadata in packages.items(): package_names.add(package_metadata["name"]) version_key = ( f"{arch}/{package_metadata['version']}-{package_metadata['build']}" ) artifact: Dict[str, Any] = { "filename": filename, "url": self.ARCHIVE_URL_PATTERN.format( url=self.url, channel=self.channel, filename=filename, arch=arch, ), "version": version_key, "checksums": {}, } for checksum in ("md5", "sha256"): if checksum in package_metadata: artifact["checksums"][checksum] = package_metadata[checksum] self.packages[package_metadata["name"]][version_key] = artifact package_date = None if "timestamp" in package_metadata: package_date = datetime.datetime.fromtimestamp( package_metadata["timestamp"] / 1e3, datetime.timezone.utc ) elif "date" in package_metadata: package_date = iso8601.parse_date(package_metadata["date"]) if package_date: artifact["date"] = package_date.isoformat() self.package_dates[package_metadata["name"]].append(package_date) for package_name in package_names: package_dates = self.package_dates[package_name] yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.ORIGIN_URL_PATTERN.format( channel=self.channel, pkgname=package_name ), last_update=max(package_dates, default=None), extra_loader_arguments={ "artifacts": list(self.packages[package_name].values()) }, ) diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 32f7479..80669eb 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -1,199 +1,205 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from datetime import datetime import logging from typing import Any, Dict, 
Iterator, List, Optional, Set, Union import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. CpanListerPage = Set[str] def get_field_value(entry, field_name): """ Splits ``field_name`` on ``.``, and use it as path in the nested ``entry`` dictionary. If a value does not exist, returns None. >>> entry = {"_source": {"foo": 1, "bar": {"baz": 2, "qux": [3]}}} >>> get_field_value(entry, "foo") 1 >>> get_field_value(entry, "bar") {'baz': 2, 'qux': [3]} >>> get_field_value(entry, "bar.baz") 2 >>> get_field_value(entry, "bar.qux") 3 """ fields = field_name.split(".") field_value = entry["_source"] for field in fields[:-1]: field_value = field_value.get(field, {}) field_value = field_value.get(fields[-1]) # scrolled results might have field value in a list if isinstance(field_value, list): field_value = field_value[0] return field_value def get_module_version( module_name: str, module_version: Union[str, float, int], release_name: str ) -> str: # some old versions fail to be parsed and cpan api set version to 0 if module_version == 0: prefix = f"{module_name}-" if release_name.startswith(prefix): # extract version from release name module_version = release_name.replace(prefix, "", 1) return str(module_version) class CpanLister(StatelessLister[CpanListerPage]): """The Cpan lister list origins from 'Cpan', the Comprehensive Perl Archive Network.""" LISTER_NAME = "cpan" VISIT_TYPE = "cpan" INSTANCE = "cpan" API_BASE_URL = "https://fastapi.metacpan.org/v1" REQUIRED_DOC_FIELDS = [ "download_url", "checksum_sha256", "distribution", "version", ] OPTIONAL_DOC_FIELDS = ["date", "author", "stat.size", "name", "metadata.author"] ORIGIN_URL_PATTERN = "https://metacpan.org/dist/{module_name}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) self.module_metadata: Dict[str, List[Dict[str, Any]]] = defaultdict(list) self.release_dates: Dict[str, List[datetime]] = defaultdict(list) self.module_names: Set[str] = set() def process_release_page(self, page: List[Dict[str, Any]]): for entry in page: if "_source" not in entry or not all( k in entry["_source"].keys() for k in self.REQUIRED_DOC_FIELDS ): logger.warning( "Skipping release entry %s as some required fields are missing", entry.get("_source"), ) continue module_name = get_field_value(entry, "distribution") module_version = get_field_value(entry, "version") module_download_url = get_field_value(entry, "download_url") module_sha256_checksum = get_field_value(entry, "checksum_sha256") module_date = get_field_value(entry, "date") module_size = get_field_value(entry, "stat.size") module_author = get_field_value(entry, "author") module_author_fullname = get_field_value(entry, "metadata.author") release_name = get_field_value(entry, "name") module_version = get_module_version( module_name, module_version, release_name ) self.artifacts[module_name].append( { "url": module_download_url, "filename": 
module_download_url.split("/")[-1], "checksums": {"sha256": module_sha256_checksum}, "version": module_version, "length": module_size, } ) self.module_metadata[module_name].append( { "name": module_name, "version": module_version, "cpan_author": module_author, "author": ( module_author_fullname if module_author_fullname not in (None, "", "unknown") else module_author ), "date": module_date, "release_name": release_name, } ) self.release_dates[module_name].append(iso8601.parse_date(module_date)) self.module_names.add(module_name) def get_pages(self) -> Iterator[CpanListerPage]: """Yield an iterator which returns 'page'""" endpoint = f"{self.API_BASE_URL}/release/_search" scrollendpoint = f"{self.API_BASE_URL}/_search/scroll" size = 1000 res = self.http_request( endpoint, params={ "_source": self.REQUIRED_DOC_FIELDS + self.OPTIONAL_DOC_FIELDS, "size": size, "scroll": "1m", }, ) data = res.json()["hits"]["hits"] self.process_release_page(data) _scroll_id = res.json()["_scroll_id"] while data: scroll_res = self.http_request( scrollendpoint, params={"scroll": "1m", "scroll_id": _scroll_id} ) data = scroll_res.json()["hits"]["hits"] _scroll_id = scroll_res.json()["_scroll_id"] self.process_release_page(data) yield self.module_names def get_origins_from_page( self, module_names: CpanListerPage ) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for module_name in module_names: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.ORIGIN_URL_PATTERN.format(module_name=module_name), last_update=max(self.release_dates[module_name]), extra_loader_arguments={ "api_base_url": self.API_BASE_URL, "artifacts": self.artifacts[module_name], "module_metadata": self.module_metadata[module_name], }, ) diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 35e3d2b..728c6d3 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,150 +1,159 @@ # Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import json import logging import subprocess from typing import Dict, Iterator, List, Optional, Tuple import pkg_resources from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) CRAN_MIRROR = "https://cran.r-project.org" PageType = List[Dict[str, str]] class CRANLister(StatelessLister[PageType]): """ List all packages hosted on The Comprehensive R Archive Network. """ LISTER_NAME = "CRAN" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials + scheduler, + url=CRAN_MIRROR, + instance="cran", + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_pages(self) -> Iterator[PageType]: """ Yields a single page containing all CRAN packages info. 
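
        Each package info entry is a dict like the ones returned by
        ``read_cran_data``, e.g.::

            {'Package': 'A3', 'Version': '1.0.0', ...}

        The 'MD5sum' and 'Packaged' fields are also used to fill artifact
        checksums and last update dates.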
""" yield read_cran_data() def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None seen_urls = set() for package_info in page: origin_url, artifact_url = compute_origin_urls(package_info) if origin_url in seen_urls: # prevent multiple listing of an origin, # most recent version will be listed first continue seen_urls.add(origin_url) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="cran", last_update=parse_packaged_date(package_info), extra_loader_arguments={ "artifacts": [ { "url": artifact_url, "version": package_info["Version"], "package": package_info["Package"], "checksums": {"md5": package_info["MD5sum"]}, } ] }, ) def read_cran_data() -> List[Dict[str, str]]: """ Runs R script which uses inbuilt API to return a json response containing data about the R packages. Returns: List of Dict about R packages. For example:: [ { 'Package': 'A3', 'Version': '1.0.0' }, { 'Package': 'abbyyR', 'Version': '0.5.4' }, ... ] """ filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R") logger.debug("Executing R script %s", filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode("utf-8")) def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]: """Compute the package url from the repo dict. Args: repo: dict with key 'Package', 'Version' Returns: the tuple project url, artifact url """ package = package_info["Package"] version = package_info["Version"] origin_url = f"{CRAN_MIRROR}/package={package}" artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz" return origin_url, artifact_url def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]: packaged_at_str = package_info.get("Packaged", "") packaged_at = None if packaged_at_str: packaged_at_str = packaged_at_str.replace(" UTC", "") # Packaged field possible formats: # - "%Y-%m-%d %H:%M:%S[.%f] UTC; ", # - "%a %b %d %H:%M:%S %Y; " for date_format in ( "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%a %b %d %H:%M:%S %Y", ): try: packaged_at = datetime.strptime( packaged_at_str.split(";")[0], date_format, ).replace(tzinfo=timezone.utc) break except Exception: continue if packaged_at is None: logger.debug( "Could not parse %s package release date: %s", package_info["Package"], packaged_at_str, ) return packaged_at diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index eca9f10..6b8c94a 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,250 +1,256 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import csv from dataclasses import dataclass from datetime import datetime import json import logging from pathlib import Path import tarfile import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse import iso8601 from packaging.version import parse as parse_version from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
CratesListerPage = List[Dict[str, Any]] @dataclass class CratesListerState: """Store lister state for incremental mode operations. 'index_last_update' represents the UTC time the crates.io database dump was started """ index_last_update: Optional[datetime] = None class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. It downloads a tar.gz archive which contains crates.io database table content as csv files which is automatically generated every 24 hours. Parsing two csv files we can list all Crates.io package names and their related versions. In incremental mode, it check each entry comparing their 'last_update' value with self.state.index_last_update """ LISTER_NAME = "crates" VISIT_TYPE = "crates" INSTANCE = "crates" BASE_URL = "https://crates.io" DB_DUMP_URL = "https://static.crates.io/db-dump.tar.gz" CRATE_FILE_URL_PATTERN = ( "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" ) CRATE_URL_PATTERN = "https://crates.io/crates/{crate}" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.BASE_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.index_metadata: Dict[str, str] = {} def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: index_last_update = d.get("index_last_update") if index_last_update is not None: d["index_last_update"] = iso8601.parse_date(index_last_update) return CratesListerState(**d) def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"index_last_update": None} index_last_update = state.index_last_update if index_last_update is not None: d["index_last_update"] = index_last_update.isoformat() return d def is_new(self, dt_str: str): """Returns True when dt_str is greater than self.state.index_last_update """ dt = iso8601.parse_date(dt_str) last = self.state.index_last_update return not last or (last is not None and last < dt) def get_and_parse_db_dump(self) -> Dict[str, Any]: """Download and parse csv files from db_dump_path. Returns a dict where each entry corresponds to a package name with its related versions. 
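
        The returned mapping is keyed by crate name and roughly looks like
        (crate name and versions are illustrative; each version maps to its
        raw ``versions.csv`` row)::

            {"rand": {"name": "rand", "updated_at": "2022-01-01 00:00:00",
                      "versions": {"0.8.4": {...}, "0.8.5": {...}}}}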
""" with tempfile.TemporaryDirectory() as tmpdir: file_name = self.DB_DUMP_URL.split("/")[-1] archive_path = Path(tmpdir) / file_name # Download the Db dump with self.http_request(self.DB_DUMP_URL, stream=True) as res: with open(archive_path, "wb") as out_file: for chunk in res.iter_content(chunk_size=1024): out_file.write(chunk) # Extract the Db dump db_dump_path = Path(str(archive_path).split(".tar.gz")[0]) tar = tarfile.open(archive_path) tar.extractall(path=db_dump_path) tar.close() csv.field_size_limit(1000000) (crates_csv_path,) = list(db_dump_path.glob("*/data/crates.csv")) (versions_csv_path,) = list(db_dump_path.glob("*/data/versions.csv")) (index_metadata_json_path,) = list(db_dump_path.rglob("*metadata.json")) with index_metadata_json_path.open("rb") as index_metadata_json: self.index_metadata = json.load(index_metadata_json) crates: Dict[str, Any] = {} with crates_csv_path.open() as crates_fd: crates_csv = csv.DictReader(crates_fd) for item in crates_csv: if self.is_new(item["updated_at"]): # crate 'id' as key crates[item["id"]] = { "name": item["name"], "updated_at": item["updated_at"], "versions": {}, } data: Dict[str, Any] = {} with versions_csv_path.open() as versions_fd: versions_csv = csv.DictReader(versions_fd) for version in versions_csv: if version["crate_id"] in crates.keys(): crate: Dict[str, Any] = crates[version["crate_id"]] crate["versions"][version["num"]] = version # crate 'name' as key data[crate["name"]] = crate return data def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: """Transform package version definition dict to a suitable page entry dict """ crate_file = self.CRATE_FILE_URL_PATTERN.format( crate=entry["name"], version=entry["version"] ) filename = urlparse(crate_file).path.split("/")[-1] return dict( name=entry["name"], version=entry["version"], checksum=entry["checksum"], yanked=True if entry["yanked"] == "t" else False, crate_file=crate_file, filename=filename, last_update=entry["updated_at"], ) def get_pages(self) -> Iterator[CratesListerPage]: """Each page is a list of crate versions with: - name: Name of the crate - version: Version - checksum: Checksum - yanked: Whether the package is yanked or not - crate_file: Url of the crate file - filename: File name of the crate file - last_update: Last update for that version """ # Fetch crates.io Db dump, then Parse the data. 
dataset = self.get_and_parse_db_dump() logger.debug("Found %s crates in crates_index", len(dataset)) # Each entry from dataset will correspond to a page for name, item in dataset.items(): page = [] # sort crate versions versions: list = sorted(item["versions"].keys(), key=parse_version) for version in versions: v = item["versions"][version] v["name"] = name v["version"] = version page.append(self.page_entry_dict(v)) yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None url = self.CRATE_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] artifacts = [] crates_metadata = [] for entry in page: # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 artifacts.append( { "version": entry["version"], "filename": entry["filename"], "url": entry["crate_file"], "checksums": { "sha256": entry["checksum"], }, } ) crates_metadata.append( { "version": entry["version"], "yanked": entry["yanked"], "last_update": entry["last_update"], } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=iso8601.parse_date(last_update), extra_loader_arguments={ "artifacts": artifacts, "crates_metadata": crates_metadata, }, ) def finalize(self) -> None: last: datetime = iso8601.parse_date(self.index_metadata["timestamp"]) if not self.state.index_last_update: self.state.index_last_update = last self.updated = True diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 940e453..23d520a 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,276 +1,282 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bz2 from collections import defaultdict from dataclasses import dataclass, field from email.utils import parsedate_to_datetime import gzip from itertools import product import logging import lzma import os from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple from urllib.parse import urljoin from debian.deb822 import Sources from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) decompressors: Dict[str, Callable[[Any], Any]] = { "gz": lambda f: gzip.GzipFile(fileobj=f), "bz2": bz2.BZ2File, "xz": lzma.LZMAFile, } Suite = str Component = str PkgName = str PkgVersion = str DebianOrigin = str DebianPageType = Iterator[Sources] @dataclass class DebianListerState: """State of debian lister""" package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) """Dictionary mapping a package name to all the versions found during last listing""" class DebianLister(Lister[DebianListerState, DebianPageType]): """ List source packages for a given debian or derivative distribution. The lister will create a snapshot for each package name from all its available versions. 
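
    Origins are identified by a synthetic URL of the form
    ``deb://<distribution>/packages/<package_name>`` (cf.
    ``origin_url_for_package``) and carry a ``packages`` extra loader argument
    keyed by ``<suite>/<component>/<version>``.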
If a package snapshot is different from the last listing operation, it will be send to the scheduler that will create a loading task to archive newly found source code. Args: scheduler: instance of SchedulerInterface distribution: identifier of listed distribution (e.g. Debian, Ubuntu) mirror_url: debian package archives mirror URL suites: list of distribution suites to process components: list of package components to process """ LISTER_NAME = "debian" def __init__( self, scheduler: SchedulerInterface, distribution: str = "Debian", mirror_url: str = "http://deb.debian.org/debian/", suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=mirror_url, instance=distribution, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # to ensure urljoin will produce valid Sources URL if not self.url.endswith("/"): self.url += "/" self.distribution = distribution self.suites = suites self.components = components # will hold all listed origins info self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {} # will contain the lister state after a call to run self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} def state_from_dict(self, d: Dict[str, Any]) -> DebianListerState: return DebianListerState(package_versions={k: set(v) for k, v in d.items()}) def state_to_dict(self, state: DebianListerState) -> Dict[str, Any]: return {k: list(v) for k, v in state.package_versions.items()} def debian_index_urls( self, suite: Suite, component: Component ) -> Iterator[Tuple[str, str]]: """Return an iterator on possible Sources file URLs as multiple compression formats can be used.""" compression_exts = ("xz", "bz2", "gz") base_urls = [ urljoin(self.url, f"dists/{suite}/{component}/source/Sources"), urljoin(self.url, f"dists/{suite}/updates/{component}/source/Sources"), ] for base_url, ext in product(base_urls, compression_exts): yield (f"{base_url}.{ext}", ext) yield (base_url, "") def page_request(self, suite: Suite, component: Component) -> DebianPageType: """Return parsed package Sources file for a given debian suite and component.""" for url, compression in self.debian_index_urls(suite, component): try: response = self.http_request(url, stream=True) except HTTPError: pass else: last_modified = response.headers.get("Last-Modified") self.last_sources_update = ( parsedate_to_datetime(last_modified) if last_modified else None ) decompressor = decompressors.get(compression) if decompressor: data = decompressor(response.raw).readlines() else: data = response.raw.readlines() break else: data = "" logger.debug("Could not retrieve sources index for %s/%s", suite, component) return Sources.iter_paragraphs(data) def get_pages(self) -> Iterator[DebianPageType]: """Return an iterator on parsed debian package Sources files, one per combination of debian suite and component.""" for suite, component in product(self.suites, self.components): logger.debug( "Processing %s %s source packages info for %s component.", self.instance, suite, component, ) self.current_suite = suite self.current_component = component yield self.page_request(suite, component) def origin_url_for_package(self, package_name: PkgName) -> DebianOrigin: """Return the origin url for the given package""" return 
f"deb://{self.instance}/packages/{package_name}" def get_origins_from_page(self, page: DebianPageType) -> Iterator[ListedOrigin]: """Convert a page of debian package sources into an iterator of ListedOrigin. Please note that the returned origins correspond to packages only listed for the first time in order to get an accurate origins counter in the statistics returned by the run method of the lister. Packages already listed in another page but with different versions will be put in cache by the method and updated ListedOrigin objects will be sent to the scheduler later in the commit_page method. Indeed as multiple debian suites can be processed, a similar set of package names can be listed for two different package source pages, only their version will differ, resulting in origins counted multiple times in lister statistics. """ assert self.lister_obj.id is not None origins_to_send = {} # iterate on each package source info for src_pkg in page: # gather package files info that will be used by the debian loader files: Dict[str, Dict[str, Any]] = defaultdict(dict) for field_ in src_pkg._multivalued_fields: if field_.startswith("checksums-"): sum_name = field_[len("checksums-") :] else: sum_name = "md5sum" if field_ in src_pkg: for entry in src_pkg[field_]: name = entry["name"] files[name]["name"] = name files[name]["size"] = int(entry["size"], 10) files[name][sum_name] = entry[sum_name] files[name]["uri"] = os.path.join( self.url, src_pkg["Directory"], name ) # extract package name and version package_name = src_pkg["Package"] package_version = src_pkg["Version"] # build origin url origin_url = self.origin_url_for_package(package_name) # create package version key as expected by the debian loader package_version_key = ( f"{self.current_suite}/{self.current_component}/{package_version}" ) # this is the first time a package is listed if origin_url not in self.listed_origins: # create a ListedOrigin object for it that can be later # updated with new package versions info self.listed_origins[origin_url] = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="deb", extra_loader_arguments={"packages": {}}, last_update=self.last_sources_update, ) # init set that will contain all listed package versions self.package_versions[package_name] = set() # origin will be yielded at the end of that method origins_to_send[origin_url] = self.listed_origins[origin_url] # update package versions data in parameter that will be provided # to the debian loader self.listed_origins[origin_url].extra_loader_arguments["packages"].update( { package_version_key: { "name": package_name, "version": package_version, "files": files, } } ) if self.listed_origins[origin_url].last_update is None or ( self.last_sources_update is not None and self.last_sources_update # type: ignore > self.listed_origins[origin_url].last_update ): # update debian package last update if current processed sources index # has a greater modification date self.listed_origins[origin_url].last_update = self.last_sources_update # add package version key to the set of found versions self.package_versions[package_name].add(package_version_key) # package has already been listed during a previous listing process if package_name in self.state.package_versions: new_versions = ( self.package_versions[package_name] - self.state.package_versions[package_name] ) # no new versions so far, no need to send the origin to the scheduler if not new_versions: origins_to_send.pop(origin_url, None) logger.debug("Found %s new packages.", 
len(origins_to_send)) logger.debug( "Current total number of listed packages is equal to %s.", len(self.listed_origins), ) yield from origins_to_send.values() def finalize(self): # set mapping between listed package names and versions as lister state self.state.package_versions = self.package_versions self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/fedora/__init__.py b/swh/lister/fedora/__init__.py new file mode 100644 index 0000000..6fb3a14 --- /dev/null +++ b/swh/lister/fedora/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import FedoraLister + + return { + "lister": FedoraLister, + "task_modules": [f"{__name__}.tasks"], + } diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py new file mode 100644 index 0000000..34712b3 --- /dev/null +++ b/swh/lister/fedora/lister.py @@ -0,0 +1,265 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import dataclass, field +from datetime import datetime, timezone +import logging +from typing import Any, Dict, Iterator, List, Optional, Set, Type +from urllib.error import HTTPError +from urllib.parse import urljoin + +import repomd + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from ..pattern import Lister + +logger = logging.getLogger(__name__) + + +Release = int +Edition = str +PkgName = str +PkgVersion = str +FedoraOrigin = str +FedoraPageType = Type[repomd.Repo] +"""Each page is a list of packages from a given Fedora (release, edition) pair""" + + +def get_editions(release: Release) -> List[Edition]: + """Get list of editions for a given release.""" + # Ignore dirs that don't contain .rpm files: + # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue + + if release < 20: + return ["Everything", "Fedora"] + elif release < 28: + return ["Everything", "Server", "Workstation"] + else: + return ["Everything", "Server", "Workstation", "Modular"] + + +def get_last_modified(pkg: repomd.Package) -> datetime: + """Get timezone aware last modified time in UTC from RPM package metadata.""" + ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") + return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) + + +def get_checksums(pkg: repomd.Package) -> Dict[str, str]: + """Get checksums associated to rpm archive.""" + cs = pkg._element.find("common:checksum", namespaces=repomd._ns) + cs_type = cs.get("type") + if cs_type == "sha": + cs_type = "sha1" + return {cs_type: cs.text} + + +@dataclass +class FedoraListerState: + """State of Fedora lister""" + + package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) + """Dictionary mapping a package name to all the versions found during + last listing""" + + +class FedoraLister(Lister[FedoraListerState, FedoraPageType]): + """ + List source packages for given Fedora releases. + + The lister will create a snapshot for each package name from all its + available versions. 
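+
+    Origins point to package pages on ``https://src.fedoraproject.org/rpms/``,
+    i.e. ``https://src.fedoraproject.org/rpms/<package_name>`` (cf.
+    ``origin_url_for_package``), and carry a ``packages`` extra loader
+    argument keyed by ``fedora<release>/<edition>/<version>``.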
+ + If a package snapshot is different from the last listing operation, + it will be sent to the scheduler that will create a loading task + to archive newly found source code. + + Args: + scheduler: instance of SchedulerInterface + url: fedora package archives mirror URL + releases: list of fedora releases to process + """ + + LISTER_NAME = "fedora" + + def __init__( + self, + scheduler: SchedulerInterface, + instance: str = "fedora", + url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", + releases: List[Release] = [34, 35, 36], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, + ): + super().__init__( + scheduler=scheduler, + url=url, + instance=instance, + credentials={}, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, + ) + + self.releases = releases + + self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {} + "will hold all listed origins info" + self.origins_to_send: Set[FedoraOrigin] = set() + "will hold updated origins since last listing" + self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} + "will contain the lister state after a call to run" + self.last_page = False + + def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState: + return FedoraListerState(package_versions={k: set(v) for k, v in d.items()}) + + def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]: + return {k: list(v) for k, v in state.package_versions.items()} + + def page_request(self, release: Release, edition: Edition) -> FedoraPageType: + """Return parsed packages for a given fedora release.""" + index_url = urljoin( + self.url, + f"{release}/{edition}/source/SRPMS/" + if release < 24 + else f"{release}/{edition}/source/tree/", + ) + + repo = repomd.load(index_url) # throws error if no repomd.xml is not found + self.last_page = ( + release == self.releases[-1] and edition == get_editions(release)[-1] + ) + + logger.debug( + "Fetched metadata from url: %s, found %d packages", index_url, len(repo) + ) + # TODO: Extract more fields like "provides" and "requires" from *primary.xml + # as extrinsic metadata using the pkg._element.findtext method + return repo + + def get_pages(self) -> Iterator[FedoraPageType]: + """Return an iterator on parsed fedora packages, one page per (release, edition) pair""" + + for release in self.releases: + for edition in get_editions(release): + logger.debug("Listing fedora release %s edition %s", release, edition) + self.current_release = release + self.current_edition = edition + try: + yield self.page_request(release, edition) + except HTTPError as http_error: + if http_error.getcode() == 404: + logger.debug( + "No packages metadata found for fedora release %s edition %s", + release, + edition, + ) + continue + raise + + def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin: + """Return the origin url for the given package""" + return f"https://src.fedoraproject.org/rpms/{package_name}" + + def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]: + """Convert a page of fedora package sources into an iterator of ListedOrigin.""" + assert self.lister_obj.id is not None + + origins_to_send = set() + + # iterate on each package's metadata + for pkg_metadata in page: + # extract package metadata + package_name = pkg_metadata.name + package_version = pkg_metadata.vr + package_version_split = package_version.split(".") + if package_version_split[-1].startswith("fc"): + 
# remove trailing ".fcXY" in version for the rpm loader to avoid + # creating multiple releases targeting same directory + package_version = ".".join(package_version_split[:-1]) + + package_build_time = get_last_modified(pkg_metadata) + package_download_path = pkg_metadata.location + + # build origin url + origin_url = self.origin_url_for_package(package_name) + # create package version key as expected by the fedora (rpm) loader + package_version_key = ( + f"fedora{self.current_release}/{self.current_edition}/" + f"{package_version}" + ).lower() + + # this is the first time a package is listed + if origin_url not in self.listed_origins: + # create a ListedOrigin object for it that can be later + # updated with new package versions info + self.listed_origins[origin_url] = ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="rpm", + extra_loader_arguments={"packages": {}}, + last_update=package_build_time, + ) + + # init set that will contain all listed package versions + self.package_versions[package_name] = set() + + # origin will be yielded at the end of that method + origins_to_send.add(origin_url) + + # update package metadata in parameter that will be provided + # to the rpm loader + self.listed_origins[origin_url].extra_loader_arguments["packages"][ + package_version_key + ] = { + "name": package_name, + "version": package_version, + "url": urljoin(page.baseurl, package_download_path), + "buildTime": package_build_time.isoformat(), + "checksums": get_checksums(pkg_metadata), + } + + last_update = self.listed_origins[origin_url].last_update + if last_update is not None and package_build_time > last_update: + self.listed_origins[origin_url].last_update = package_build_time + + # add package version key to the set of found versions + self.package_versions[package_name].add(package_version_key) + + # package has already been listed during a previous listing process + if package_name in self.state.package_versions: + new_versions = ( + self.package_versions[package_name] + - self.state.package_versions[package_name] + ) + # no new versions so far, no need to send the origin to the scheduler + if not new_versions: + origins_to_send.remove(origin_url) + + logger.debug( + "Found %s packages to update (new ones or packages with new versions).", + len(origins_to_send), + ) + logger.debug( + "Current total number of listed packages is equal to %s.", + len(self.listed_origins), + ) + + # yield from origins_to_send.values() + self.origins_to_send.update(origins_to_send) + + if self.last_page: + # yield listed origins when all fedora releases and editions processed + yield from [ + self.listed_origins[origin_url] for origin_url in self.origins_to_send + ] + + def finalize(self): + # set mapping between listed package names and versions as lister state + self.state.package_versions = self.package_versions + self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/fedora/tasks.py b/swh/lister/fedora/tasks.py new file mode 100644 index 0000000..18c8a60 --- /dev/null +++ b/swh/lister/fedora/tasks.py @@ -0,0 +1,21 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict + +from celery import shared_task + +from .lister import FedoraLister + + +@shared_task(name=__name__ + ".FullFedoraRelister") +def list_fedora_full(**lister_args) -> Dict[str, int]: + """Full update of a Fedora instance""" + lister = 
FedoraLister.from_configfile(**lister_args) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping() -> str: + return "OK" diff --git a/swh/lister/fedora/tests/__init__.py b/swh/lister/fedora/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz new file mode 100644 index 0000000..0c8eac9 Binary files /dev/null and b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz differ diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz new file mode 100644 index 0000000..95ea3a0 Binary files /dev/null and b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz differ diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz new file mode 100644 index 0000000..3d9afb7 Binary files /dev/null and b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz differ diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml new file mode 100644 index 0000000..ab786c1 --- /dev/null +++ b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml @@ -0,0 +1,55 @@ + + + 1499286311 + + 4f677623c24912d86848f86837d398979b5adc2a51d9a2170f11fe42a257f3d3 + db616ad8e4219e23dfc05cd515e017cdc0d59144689ac606951fa42cbb06ae65 + + 1499286305 + 5425131 + 30064034 + + + 17296af99a4b80bc67fccabe71ecefa02b76e8409372d936c054b8c9de312b6c + 7caabd1205a72d26422756211dcd536336cef643f7f73eb15a470b02ff09a194 + + 1499286305 + 1650273 + 6419422 + + + 8f1ed139aeaa57f5bc280ce97b82f690e4008c122b4793791ca18e513268b6eb + 786b8d4fa759f0ade3eaab1bde390d12c950dfe217eda1773400f3a3d461522b + + 1499286305 + 4396102 + 33165783 + + + 1d2c0be48c35e55669b410cb4dbe767ae4850b4c610e95ca9aee67f7eb31e457 + dc8dbac072ac1412f0ecface57fa57c5ddcac14acc880fe9b467164be733e963 + + 1499286309 + 7071217 + 26177536 + 10 + + + 5e1259759b9bedefc1ff14b81760524841402776e6c1b33014f4f5d6feb40d11 + b293d51dd4e6eb4128e40b6ce228c62b169b1d47be535e56f69b8ad622c4a6ca + + 1499286307 + 2227395 + 5529600 + 10 + + + f6b30bdfe96d2137542704288de1345c01ea14397eb187126d4474648bad5292 + 3f5d4619dcabe945b773c1c98ea40b8ead53340291bd504ab3faabfc7b57bb99 + + 1499286311 + 5264843 + 27930624 + 10 + + diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml new file mode 100644 index 0000000..12a9a76 --- /dev/null +++ b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml @@ -0,0 +1,85 @@ + + + 1651698851 + + 42155056c6d7b1f0e5437bb2a92c48e6d21a02ee8f09acc726e705c26e960a3c + a5841e7086be579d58e2dbb7628caebba32d9defa85739455d518bfaf90e39b0 + + 1651698827 + 7144060 + 45898728 + + + fc915adcdf5710f9f80dfffcec8f03088f09cf80fbc9c801d5a8f45f1f31bb92 + a96a4739268e250e3c3461da716472503ed5ed8b27161fec9a143d4a8ccf5767 + + 1651698827 + 1934835 + 7458268 + + + 461db9fa87e564d75d74c0dfbf006ea5d18ed646d4cb8dee1c69a4d95dd08d09 + 1733c3011a0323fadac711dd25176c9934698176605c3e516b6aabb9b5775e00 + + 1651698827 + 3779969 + 33166564 + + + ac60dd254bfc7557eb646a116bf8083b49fee8e942e1ef50dff7f74004897e74 + 
c752f5132f2cc5f4f137dade787154316f9503ae816212b8fabf5733cc2d344d + + 1651698851 + 9058624 + 41562112 + 10 + + + 1a279b88531d9c2e24c0bfc9a0d6b4357d70301c24fa42f649c726ed1af1d6a8 + e9b5c17e6004a78d20146aa54fa5ac93a01f4f2a95117588d649e92cfc008473 + + 1651698834 + 1809496 + 6471680 + 10 + + + 850ad17efdebe5f9ccbef03c8aec4e7589bb6a1ca9a6249578968d60ad094a4f + d13c6da8f7ad2c9060fd5b811b86facc9e926ec9273c0e135c4fe1110f784cdc + + 1651698838 + 4285108 + 27897856 + 10 + + + fc4205cf1cca7f0c157d1aa9a1348a1742ca7df671fbf7ccccd79221d473145b + a5841e7086be579d58e2dbb7628caebba32d9defa85739455d518bfaf90e39b0 + 2074f3da25ad0d45cf2776ad35dd22a6c63fafff319143c2f7dfefa98b99d651 + + 1651698828 + 6030441 + 45898728 + 231 + + + 6c77673bb8823bf04fd4520c421fd0fc84567db9f23b8aa19f600b0688e46dd9 + a96a4739268e250e3c3461da716472503ed5ed8b27161fec9a143d4a8ccf5767 + 55fc5e75acd903f01cf18328fec9c6f995bd8f80c5b085aa3e0fe116bb89e891 + + 1651698829 + 1735208 + 7458268 + 136 + + + c87c1b085ef287ba69b1f244d3fff56fc5efc01ffd1d7c10ee22328117651cd5 + 1733c3011a0323fadac711dd25176c9934698176605c3e516b6aabb9b5775e00 + 93624d227c24ff4eb2332fcb038e7157e08ed051b654820def75c5511a1ce191 + + 1651698829 + 3019451 + 33166564 + 206 + + diff --git a/swh/lister/fedora/tests/test_lister.py b/swh/lister/fedora/tests/test_lister.py new file mode 100644 index 0000000..dc09359 --- /dev/null +++ b/swh/lister/fedora/tests/test_lister.py @@ -0,0 +1,221 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from io import StringIO +from pathlib import Path +from typing import List +from unittest.mock import MagicMock +from urllib.error import HTTPError + +import pytest + +from swh.lister.fedora.lister import FedoraLister, Release, get_editions +from swh.scheduler.interface import SchedulerInterface + + +def mock_repomd(datadir, mocker, use_altered_fedora36=False): + """Mocks the .xml files fetched by repomd for the next lister run""" + paths = ["repomd26.xml", "primary26.xml.gz", "repomd36.xml", "primary36.xml.gz"] + if use_altered_fedora36: + paths[3] = "primary36-altered.xml.gz" + + cm = MagicMock() + cm.read.side_effect = [ + Path(datadir, "archives.fedoraproject.org", path).read_bytes() for path in paths + ] + cm.__enter__.return_value = cm + mocker.patch("repomd.urllib.request.urlopen").return_value = cm + + +def rpm_url(release, path): + return ( + "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" + f"{release}/Everything/source/tree/Packages/{path}" + ) + + +@pytest.fixture +def pkg_versions(): + return { + "https://src.fedoraproject.org/rpms/0install": { + "fedora26/everything/2.11-4": { + "name": "0install", + "version": "2.11-4", + "buildTime": "2017-02-10T04:59:31+00:00", + "url": rpm_url(26, "0/0install-2.11-4.fc26.src.rpm"), + "checksums": { + # note: we intentionally altered the original + # primary26.xml file to test sha1 usage + "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b", + }, + } + }, + "https://src.fedoraproject.org/rpms/0xFFFF": { + "fedora26/everything/0.3.9-15": { + "name": "0xFFFF", + "version": "0.3.9-15", + "buildTime": "2017-02-10T05:01:53+00:00", + "url": rpm_url(26, "0/0xFFFF-0.3.9-15.fc26.src.rpm"), + "checksums": { + "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f" + }, + }, + "fedora36/everything/0.9-4": { + "name": "0xFFFF", + "version": "0.9-4", + "buildTime": 
"2022-01-19T19:13:53+00:00", + "url": rpm_url(36, "0/0xFFFF-0.9-4.fc36.src.rpm"), + "checksums": { + "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" + }, + }, + }, + "https://src.fedoraproject.org/rpms/2ping": { + "fedora36/everything/4.5.1-2": { + "name": "2ping", + "version": "4.5.1-2", + "buildTime": "2022-01-19T19:12:21+00:00", + "url": rpm_url(36, "2/2ping-4.5.1-2.fc36.src.rpm"), + "checksums": { + "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28" + }, + } + }, + } + + +def run_lister( + swh_scheduler: SchedulerInterface, + releases: List[Release], + pkg_versions: dict, + origin_count: int, + updated: bool = True, +): + """Runs the lister and tests that the listed origins are correct.""" + lister = FedoraLister(scheduler=swh_scheduler, releases=releases) + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + state_pkg_versions = {k.split("/")[-1]: set(v) for k, v in pkg_versions.items()} + + # One edition from each release (we mocked get_editions) + assert stats.pages == (len(releases) if updated else 0) + assert stats.origins == origin_count + + assert { + o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins + } == pkg_versions + + assert lister_state.package_versions == state_pkg_versions + assert lister.updated == updated + + +def test_get_editions(): + assert get_editions(18) == ["Everything", "Fedora"] + assert get_editions(26) == ["Everything", "Server", "Workstation"] + assert get_editions(34) == ["Everything", "Server", "Workstation", "Modular"] + + +@pytest.mark.parametrize("status_code", [400, 404, 500]) +def test_fedora_lister_http_error( + swh_scheduler: SchedulerInterface, mocker: MagicMock, status_code: int +): + """ + Simulates handling of HTTP Errors while fetching of packages for fedora releases. + """ + releases = [18] + + is_404 = status_code == 404 + + def side_effect(url): + if is_404: + raise HTTPError( + url, status_code, "Not Found", {"content-type": "text/html"}, StringIO() + ) + else: + raise HTTPError( + url, + status_code, + "Internal server error", + {"content-type": "text/html"}, + StringIO(), + ) + + urlopen_patch = mocker.patch("repomd.urllib.request.urlopen") + urlopen_patch.side_effect = side_effect + + expected_pkgs: dict = {} + + if is_404: + run_lister( + swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False + ) + else: + with pytest.raises(HTTPError): + run_lister( + swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False + ) + + +def test_full_lister_fedora( + swh_scheduler: SchedulerInterface, + mocker: MagicMock, + datadir: Path, + pkg_versions: dict, +): + """ + Simulates a full listing of packages for fedora releases. + """ + releases = [26, 36] + + get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") + get_editions_patch.return_value = ["Everything"] + + mock_repomd(datadir, mocker) + run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) + + +def test_incremental_lister( + swh_scheduler: SchedulerInterface, + mocker: MagicMock, + datadir: Path, + pkg_versions: dict, +): + """ + Simulates an incremental listing of packages for fedora releases. 
+ """ + releases = [26, 36] + + get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") + get_editions_patch.return_value = ["Everything"] + + # First run + mock_repomd(datadir, mocker) + run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) + # Second run (no updates) + mock_repomd(datadir, mocker) + run_lister(swh_scheduler, releases, pkg_versions, origin_count=0) + + # Use an altered version of primary36.xml in which we updated the version + # of package 0xFFFF to 0.10: + mock_repomd(datadir, mocker, use_altered_fedora36=True) + # Add new version to the set of expected pkg versions: + pkg_versions["https://src.fedoraproject.org/rpms/0xFFFF"].update( + { + "fedora36/everything/0.10-4": { + "name": "0xFFFF", + "version": "0.10-4", + "buildTime": "2022-01-19T19:13:53+00:00", + "url": rpm_url(36, "0/0xFFFF-0.10-4.fc36.src.rpm"), + "checksums": { + "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" + }, + } + } + ) + + # Third run (0xFFFF in fedora36 editions got updated and it needs to be listed) + run_lister(swh_scheduler, releases, pkg_versions, origin_count=1) diff --git a/swh/lister/fedora/tests/test_tasks.py b/swh/lister/fedora/tests/test_tasks.py new file mode 100644 index 0000000..7fd4236 --- /dev/null +++ b/swh/lister/fedora/tests/test_tasks.py @@ -0,0 +1,60 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from unittest.mock import patch + +from swh.lister.pattern import ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +@patch("swh.lister.fedora.tasks.FedoraLister") +def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" + ) + res = swh_scheduler_celery_app.send_task( + "swh.lister.fedora.tasks.FullFedoraRelister", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + +@patch("swh.lister.fedora.tasks.FedoraLister") +def test_full_listing_params( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", + instance="archives.fedoraproject.org", + releases=["36"], + ) + res = swh_scheduler_celery_app.send_task( + "swh.lister.fedora.tasks.FullFedoraRelister", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 5728727..738c516 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,205 +1,211 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General 
Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import datetime import logging from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 from swh.core.github.utils import MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class GitHubListerState: """State of the GitHub lister""" last_seen_id: int = 0 """Numeric id of the last repository listed on an incremental pass""" class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """List origins from GitHub. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_id` stored in the scheduler backend. Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in that mode, the lister finds the origins present in the range **excluding** `first_id` and **including** `last_id`. In this mode, the lister can overrun the `last_id`: it will always record all the origins seen in a given page. As the lister is fully idempotent, this is not a practical problem. Once relisting completes, the lister state in the scheduler backend is not updated. When the config contains a set of credentials, we shuffle this list at the beginning of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the same token over and over again, until its rate limit runs out. Once that happens, we switch to the next token over in our shuffled list. When a request fails with a rate limit exception for all tokens, we pause the listing until the largest value for X-Ratelimit-Reset over all tokens. When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits Args: first_id: the id of the first repo to list last_id: stop listing after seeing a repo with an id higher than this value. 
""" # noqa: B950 LISTER_NAME = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, first_id: Optional[int] = None, last_id: Optional[int] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance="github", with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.first_id = first_id self.last_id = last_id self.relisting = self.first_id is not None or self.last_id is not None def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: current_id = 0 if self.first_id is not None: current_id = self.first_id elif self.state is not None: current_id = self.state.last_seen_id current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}" while self.last_id is None or current_id < self.last_id: logger.debug("Getting page %s", current_url) try: assert self.github_session is not None response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up break # We've successfully retrieved a (non-ratelimited) `response`. We # still need to check it for validity. if response.status_code != 200: logger.warning( "Got unexpected status_code %s: %s", response.status_code, response.content, ) break yield response.json() if "next" not in response.links: # No `next` link, we've reached the end of the world logger.debug( "No next link found in the response headers, all caught up" ) break # GitHub strongly advises to use the next link directly. We still # parse it to get the id of the last repository we've reached so # far. next_url = response.links["next"]["url"] parsed_url = urlparse(next_url) if not parsed_url.query: logger.warning("Failed to parse url %s", next_url) break parsed_query = parse_qs(parsed_url.query) current_id = int(parsed_query["since"][0]) current_url = next_url def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of GitHub repositories into a list of ListedOrigins. This records the html_url, as well as the pushed_at value if it exists. """ assert self.lister_obj.id is not None seen_in_page: Set[str] = set() for repo in page: if not repo: # null repositories in listings happen sometimes... 
continue if repo["html_url"] in seen_in_page: continue seen_in_page.add(repo["html_url"]) pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: pushed_at = iso8601.parse_date(pushed_at_str) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["html_url"], visit_type="git", last_update=pushed_at, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page""" if self.relisting: # Don't update internal state when relisting return if not page: # Sometimes, when you reach the end of the world, GitHub returns an empty # page of repositories return last_id = page[-1]["id"] if last_id > self.state.last_seen_id: self.state.last_seen_id = last_id def finalize(self): if self.relisting: return # Pull fresh lister state from the scheduler backend scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. if self.state.last_seen_id > scheduler_state.last_seen_id: self.updated = True diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index f57b7e2..3ad2bfd 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,260 +1,275 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import random -from typing import Any, Dict, Iterator, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple from urllib.parse import parse_qs, urlencode, urlparse import iso8601 from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log from swh.lister.pattern import CredentialsType, Lister from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) # Some instance provides hg_git type which can be ingested as hg origins VCS_MAPPING = {"hg_git": "hg"} @dataclass class GitLabListerState: """State of the GitLabLister""" last_seen_next_link: Optional[str] = None """Last link header (not visited yet) during an incremental pass """ Repository = Dict[str, Any] @dataclass class PageResult: """Result from a query to a gitlab project api page.""" repositories: Optional[Tuple[Repository, ...]] = None next_page: Optional[str] = None def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate for handling HTTP responses with status code 403 with specific ratelimit header. """ attempt = retry_state.outcome if attempt.failed: exc = attempt.exception() return ( isinstance(exc, HTTPError) and exc.response.status_code == codes.forbidden and int(exc.response.headers.get("RateLimit-Remaining", "0")) == 0 ) or is_retryable_exception(exc) return False def _parse_id_after(url: Optional[str]) -> Optional[int]: """Given an url, extract a return the 'id_after' query parameter associated value or None. This is the the repository id used for pagination purposes. """ if not url: return None # link: https://${project-api}/?...&id_after=2x... 
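# Illustrative note (hypothetical values): for a next link such as
#   https://gitlab.com/api/v4/projects?pagination=keyset&order_by=id&id_after=42
# parse_qs(urlparse(url).query) yields {"pagination": ["keyset"], "order_by": ["id"], "id_after": ["42"]},
# so 42 is returned below; an absent or empty id_after parameter yields None instead.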
query_data = parse_qs(urlparse(url).query) page = query_data.get("id_after") if page and len(page) > 0: return int(page[0]) return None class GitLabLister(Lister[GitLabListerState, PageResult]): """List origins for a gitlab instance. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_next_link` stored in the scheduler backend. Args: scheduler: a scheduler instance url: the api v4 url of the gitlab instance to visit (e.g. https://gitlab.com/api/v4/) instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...), url network location will be used if not provided incremental: defines if incremental listing is activated or not + ignored_project_prefixes: List of prefixes of project paths to ignore """ def __init__( self, scheduler, url: str, name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = False, + ignored_project_prefixes: Optional[List[str]] = None, ): if name is not None: self.LISTER_NAME = name super().__init__( scheduler=scheduler, url=url.rstrip("/"), instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.last_page: Optional[str] = None self.per_page = 100 + self.ignored_project_prefixes: Optional[Tuple[str, ...]] = None + if ignored_project_prefixes: + self.ignored_project_prefixes = tuple(ignored_project_prefixes) self.session.headers.update({"Accept": "application/json"}) if len(self.credentials) > 0: cred = random.choice(self.credentials) logger.info( "Using %s credentials from user %s", self.instance, cred["username"] ) api_token = cred["password"] if api_token: self.session.headers["Authorization"] = f"Bearer {api_token}" def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: return GitLabListerState(**d) def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: logger.debug("Fetching URL %s", url) response = self.session.get(url) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) # GitLab API can return errors 500 when listing projects. # https://gitlab.com/gitlab-org/gitlab/-/issues/262629 # To avoid ending the listing prematurely, skip buggy URLs and move # to next pages. if response.status_code == 500: id_after = _parse_id_after(url) assert id_after is not None while True: next_id_after = id_after + self.per_page url = url.replace(f"id_after={id_after}", f"id_after={next_id_after}") response = self.session.get(url) if response.status_code == 200: break else: id_after = next_id_after else: response.raise_for_status() repositories: Tuple[Repository, ...] 
= tuple(response.json()) if hasattr(response, "links") and response.links.get("next"): next_page = response.links["next"]["url"] else: next_page = None return PageResult(repositories, next_page) def page_url(self, id_after: Optional[int] = None) -> str: parameters = { "pagination": "keyset", "order_by": "id", "sort": "asc", "simple": "true", "per_page": f"{self.per_page}", } if id_after is not None: parameters["id_after"] = str(id_after) return f"{self.url}/projects?{urlencode(parameters)}" def get_pages(self) -> Iterator[PageResult]: next_page: Optional[str] if self.incremental and self.state and self.state.last_seen_next_link: next_page = self.state.last_seen_next_link else: next_page = self.page_url() while next_page: self.last_page = next_page page_result = self.get_page_result(next_page) yield page_result next_page = page_result.next_page def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None repositories = page_result.repositories if page_result.repositories else [] for repo in repositories: + if self.ignored_project_prefixes and repo["path_with_namespace"].startswith( + self.ignored_project_prefixes + ): + continue visit_type = repo.get("vcs_type", "git") visit_type = VCS_MAPPING.get(visit_type, visit_type) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["http_url_to_repo"], visit_type=visit_type, last_update=iso8601.parse_date(repo["last_activity_at"]), ) def commit_page(self, page_result: PageResult) -> None: """Update currently stored state using the latest listed "next" page if relevant. Relevancy is determined by the next_page link whose 'page' id must be strictly superior to the currently stored one. Note: this is a noop for full listing mode """ if self.incremental: # link: https://${project-api}/?...&page=2x... next_page = page_result.next_page if not next_page and self.last_page: next_page = self.last_page if next_page: id_after = _parse_id_after(next_page) previous_next_page = self.state.last_seen_next_link previous_id_after = _parse_id_after(previous_next_page) if previous_next_page is None or ( previous_id_after and id_after and previous_id_after < id_after ): self.state.last_seen_next_link = next_page def finalize(self) -> None: """finalize the lister state when relevant (see `fn:commit_page` for details) Note: this is a noop for full listing mode """ next_page = self.state.last_seen_next_link if self.incremental and next_page: # link: https://${project-api}/?...&page=2x... 
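# Illustrative note (hypothetical ids): if the scheduler backend last stored a
# next link ending in id_after=100 and this run last saw id_after=300, the
# comparison below holds (100 < 300), so self.updated is set and the newer state
# gets recorded; otherwise the previously stored state is left untouched.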
next_id_after = _parse_id_after(next_page) scheduler_state = self.get_state_from_scheduler() previous_next_id_after = _parse_id_after( scheduler_state.last_seen_next_link ) if (not previous_next_id_after and next_id_after) or ( previous_next_id_after and next_id_after and previous_next_id_after < next_id_after ): self.updated = True diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py index 6bbffcd..5b80963 100644 --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -1,358 +1,393 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import json import logging from pathlib import Path from typing import Dict, List import pytest from requests.status_codes import codes from swh.lister import USER_AGENT_TEMPLATE from swh.lister.gitlab.lister import GitLabLister, _parse_id_after from swh.lister.pattern import ListerStats from swh.lister.tests.test_utils import assert_sleep_calls from swh.lister.utils import WAIT_EXP_BASE logger = logging.getLogger(__name__) def api_url(instance: str) -> str: return f"https://{instance}/api/v4/" def _match_request(request, lister_name="gitlab"): return request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % lister_name def test_lister_gitlab(datadir, swh_scheduler, requests_mock): """Gitlab lister supports full listing""" instance = "gitlab.com" lister = GitLabLister(swh_scheduler, url=api_url(instance), instance=instance) response = gitlab_page_response(datadir, instance, 1) requests_mock.get( lister.page_url(), [{"json": response}], additional_matcher=_match_request, ) listed_result = lister.run() expected_nb_origins = len(response) assert listed_result == ListerStats(pages=1, origins=expected_nb_origins) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(f"https://{instance}") assert listed_origin.last_update is not None def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock): """Heptapod lister happily lists hg, hg_git as hg and git origins""" name = "heptapod" instance = "foss.heptapod.net" lister = GitLabLister( swh_scheduler, url=api_url(instance), name=name, instance=instance ) assert lister.LISTER_NAME == name response = gitlab_page_response(datadir, instance, 1) requests_mock.get( lister.page_url(), [{"json": response}], additional_matcher=functools.partial(_match_request, lister_name="heptapod"), ) listed_result = lister.run() expected_nb_origins = len(response) for entry in response: assert entry["vcs_type"] in ("hg", "hg_git") assert listed_result == ListerStats(pages=1, origins=expected_nb_origins) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins for listed_origin in scheduler_origins: assert listed_origin.visit_type == "hg" assert listed_origin.url.startswith(f"https://{instance}") assert listed_origin.last_update is not None def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]: """Return list of repositories (out of test dataset)""" datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json") return 
json.loads(datapath.read_text()) if datapath.exists else [] def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir): """Gitlab lister supports pagination""" instance = "gite.lirmm.fr" lister = GitLabLister(swh_scheduler, url=api_url(instance)) response1 = gitlab_page_response(datadir, instance, 1) response2 = gitlab_page_response(datadir, instance, 2) requests_mock.get( lister.page_url(), [{"json": response1, "headers": {"Link": f"<{lister.page_url(2)}>; rel=next"}}], additional_matcher=_match_request, ) requests_mock.get( lister.page_url(2), [{"json": response2}], additional_matcher=_match_request, ) listed_result = lister.run() expected_nb_origins = len(response1) + len(response2) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) scheduler_origins = lister.scheduler.get_listed_origins( lister.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(f"https://{instance}") assert listed_origin.last_update is not None def test_lister_gitlab_incremental(swh_scheduler, requests_mock, datadir): """Gitlab lister supports incremental visits""" instance = "gite.lirmm.fr" url = api_url(instance) lister = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) url_page1 = lister.page_url() response1 = gitlab_page_response(datadir, instance, 1) url_page2 = lister.page_url(2) response2 = gitlab_page_response(datadir, instance, 2) url_page3 = lister.page_url(3) response3 = gitlab_page_response(datadir, instance, 3) requests_mock.get( url_page1, [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], additional_matcher=_match_request, ) requests_mock.get( url_page2, [{"json": response2}], additional_matcher=_match_request, ) listed_result = lister.run() expected_nb_origins = len(response1) + len(response2) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) assert lister.state.last_seen_next_link == url_page2 lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) # Lister will start back at the last stop requests_mock.get( url_page2, [{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}], additional_matcher=_match_request, ) requests_mock.get( url_page3, [{"json": response3}], additional_matcher=_match_request, ) listed_result2 = lister2.run() assert listed_result2 == ListerStats( pages=2, origins=len(response2) + len(response3) ) assert lister2.state.last_seen_next_link == url_page3 assert lister.lister_obj.id == lister2.lister_obj.id scheduler_origins = lister2.scheduler.get_listed_origins( lister2.lister_obj.id ).results assert len(scheduler_origins) == len(response1) + len(response2) + len(response3) for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(f"https://{instance}") assert listed_origin.last_update is not None def test_lister_gitlab_rate_limit(swh_scheduler, requests_mock, datadir, mocker): """Gitlab lister supports rate-limit""" instance = "gite.lirmm.fr" url = api_url(instance) lister = GitLabLister(swh_scheduler, url=url, instance=instance) url_page1 = lister.page_url() response1 = gitlab_page_response(datadir, instance, 1) url_page2 = lister.page_url(2) response2 = gitlab_page_response(datadir, instance, 2) requests_mock.get( url_page1, [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], additional_matcher=_match_request, ) 
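# Note: responses registered as a list with requests_mock are served in order,
# one per matching request, so the lister first receives two 403 rate-limited
# replies for url_page2 before the final 200 payload and must retry (with mocked sleeps).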
requests_mock.get( url_page2, [ # rate limited twice {"status_code": codes.forbidden, "headers": {"RateLimit-Remaining": "0"}}, {"status_code": codes.forbidden, "headers": {"RateLimit-Remaining": "0"}}, # ok {"json": response2}, ], additional_matcher=_match_request, ) # To avoid this test being too slow, we mock sleep within the retry behavior mock_sleep = mocker.patch.object(lister.get_page_result.retry, "sleep") listed_result = lister.run() expected_nb_origins = len(response1) + len(response2) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE]) @pytest.mark.parametrize("status_code", [502, 503, 520]) def test_lister_gitlab_http_errors( swh_scheduler, requests_mock, datadir, mocker, status_code ): """Gitlab lister should retry requests when encountering HTTP 50x errors""" instance = "gite.lirmm.fr" url = api_url(instance) lister = GitLabLister(swh_scheduler, url=url, instance=instance) url_page1 = lister.page_url() response1 = gitlab_page_response(datadir, instance, 1) url_page2 = lister.page_url(2) response2 = gitlab_page_response(datadir, instance, 2) requests_mock.get( url_page1, [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], additional_matcher=_match_request, ) requests_mock.get( url_page2, [ # first request ends up with error {"status_code": status_code}, # second request is ok {"json": response2}, ], additional_matcher=_match_request, ) # To avoid this test being too slow, we mock sleep within the retry behavior mock_sleep = mocker.patch.object(lister.get_page_result.retry, "sleep") listed_result = lister.run() expected_nb_origins = len(response1) + len(response2) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) assert_sleep_calls(mocker, mock_sleep, [1]) def test_lister_gitlab_http_error_500(swh_scheduler, requests_mock, datadir): """Gitlab lister should skip buggy URL and move to next page.""" instance = "gite.lirmm.fr" url = api_url(instance) lister = GitLabLister(swh_scheduler, url=url, instance=instance) url_page1 = lister.page_url() response1 = gitlab_page_response(datadir, instance, 1) url_page2 = lister.page_url(lister.per_page) url_page3 = lister.page_url(2 * lister.per_page) response3 = gitlab_page_response(datadir, instance, 3) requests_mock.get( url_page1, [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}], additional_matcher=_match_request, ) requests_mock.get( url_page2, [ {"status_code": 500}, ], additional_matcher=_match_request, ) requests_mock.get( url_page3, [{"json": response3}], additional_matcher=_match_request, ) listed_result = lister.run() expected_nb_origins = len(response1) + len(response3) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) def test_lister_gitlab_credentials(swh_scheduler): """Gitlab lister supports credentials configuration""" instance = "gitlab" credentials = { "gitlab": {instance: [{"username": "user", "password": "api-token"}]} } url = api_url(instance) lister = GitLabLister( scheduler=swh_scheduler, url=url, instance=instance, credentials=credentials ) assert lister.session.headers["Authorization"] == "Bearer api-token" @pytest.mark.parametrize( "url", [ api_url("gitlab").rstrip("/"), api_url("gitlab"), ], ) def test_lister_gitlab_url_computation(url, swh_scheduler): lister = GitLabLister(scheduler=swh_scheduler, url=url) assert not lister.url.endswith("/") page_url = lister.page_url() # ensure the generated url contains the separated / assert 
page_url.startswith(f"{lister.url}/projects") @pytest.mark.parametrize( "url,expected_result", [ (None, None), ("http://dummy/?query=1", None), ("http://dummy/?foo=bar&id_after=1&some=result", 1), ("http://dummy/?foo=bar&id_after=&some=result", None), ], ) def test__parse_id_after(url, expected_result): assert _parse_id_after(url) == expected_result + + +def test_lister_gitlab_ignored_project_prefixes(datadir, swh_scheduler, requests_mock): + """Gitlab lister supports listing with ignored project prefixes""" + instance = "gitlab.com" + lister = GitLabLister( + swh_scheduler, + url=api_url(instance), + instance=instance, + ignored_project_prefixes=["jonan/"], + ) + + response = gitlab_page_response(datadir, instance, 1) + + requests_mock.get( + lister.page_url(), + [{"json": response}], + additional_matcher=_match_request, + ) + + listed_result = lister.run() + # 2 origins start with jonan/ + expected_nb_origins = len(response) - 2 + assert listed_result == ListerStats(pages=1, origins=expected_nb_origins) + + scheduler_origins = lister.scheduler.get_listed_origins( + lister.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith(f"https://{instance}") + assert not listed_origin.url.startswith(f"https://{instance}/jonan/") + assert listed_origin.last_update is not None diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 65eca1f..721bdc2 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,75 +1,81 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Iterator, Mapping, Optional import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister from .tree import GNUTree logger = logging.getLogger(__name__) GNUPageType = Mapping[str, Any] class GNULister(StatelessLister[GNUPageType]): """ List all GNU projects and associated artifacts. """ LISTER_NAME = "GNU" GNU_FTP_URL = "https://ftp.gnu.org" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GNU_FTP_URL, instance="GNU", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # no side-effect calls in constructor, if extra state is needed, as preconized # by the pattern docstring, this must happen in the get_pages method. self.gnu_tree: Optional[GNUTree] = None def get_pages(self) -> Iterator[GNUPageType]: """ Yield a single page listing all GNU projects. """ # first fetch the manifest to parse self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") yield self.gnu_tree.projects def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: """ Iterate on all GNU projects and yield ListedOrigin instances. 
""" assert self.lister_obj.id is not None assert self.gnu_tree is not None artifacts = self.gnu_tree.artifacts for project_name, project_info in page.items(): origin_url = project_info["url"] last_update = iso8601.parse_date(project_info["time_modified"]) logger.debug("Found origin %s last updated on %s", origin_url, last_update) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="tar", last_update=last_update, extra_loader_arguments={"artifacts": artifacts[project_name]}, ) diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py index ce8a398..cdc5576 100644 --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -1,208 +1,214 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, List, Optional, Tuple from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) Repo = Dict[str, Any] @dataclass class GogsListerPage: repos: Optional[List[Repo]] = None next_link: Optional[str] = None @dataclass class GogsListerState: last_seen_next_link: Optional[str] = None """Last link header (could be already visited) during an incremental pass.""" last_seen_repo_id: Optional[int] = None """Last repo id seen during an incremental pass.""" def _parse_page_id(url: Optional[str]) -> int: """Parse the page id from a Gogs page url.""" if url is None: return 0 return int(parse_qs(urlparse(url).query)["page"][0]) class GogsLister(Lister[GogsListerState, GogsListerPage]): """List origins from the Gogs Gogs API documentation: https://github.com/gogs/docs-api The API may be protected behind authentication so credentials/API tokens can be provided. The lister supports pagination and provides next page URL through the 'next' value of the 'Link' header. The default value for page size ('limit') is 10 but the maximum allowed value is 50. 
Api can usually be found at the location: https:///api/v1/repos/search """ LISTER_NAME = "gogs" VISIT_TYPE = "git" REPO_LIST_PATH = "repos/search" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, api_token: Optional[str] = None, page_size: int = 50, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.query_params = { "limit": page_size, } self.api_token = api_token if self.api_token is None: if len(self.credentials) > 0: cred = random.choice(self.credentials) username = cred.get("username") self.api_token = cred["password"] logger.info("Using authentication credentials from user %s", username) self.session.headers.update({"Accept": "application/json"}) if self.api_token: self.session.headers["Authorization"] = f"token {self.api_token}" else: logger.warning( "No authentication token set in configuration, using anonymous mode" ) def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState: return GogsListerState(**d) def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: return asdict(state) def page_request( self, url: str, params: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: logger.debug("Fetching URL %s with params %s", url, params) try: response = self.http_request(url, params=params) except HTTPError as http_error: if ( http_error.response.status_code == 500 ): # Temporary hack for skipping fatal repos (T4423) url_parts = urlparse(url) query: Dict[str, Any] = dict(parse_qsl(url_parts.query)) query.update({"page": _parse_page_id(url) + 1}) next_page_link = url_parts._replace(query=urlencode(query)).geturl() body: Dict[str, Any] = {"data": []} links = {"next": {"url": next_page_link}} return body, links else: raise return response.json(), response.links @classmethod def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: fields_filter = ["id", "clone_url", "updated_at"] return [{k: r[k] for k in fields_filter} for r in body["data"]] def get_pages(self) -> Iterator[GogsListerPage]: page_id = 1 if self.state.last_seen_next_link is not None: page_id = _parse_page_id(self.state.last_seen_next_link) # base with trailing slash, path without leading slash for urljoin next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) assert next_link is not None body, links = self.page_request( next_link, {**self.query_params, "page": page_id} ) while next_link is not None: repos = self.extract_repos(body) if "next" in links: next_link = links["next"]["url"] else: next_link = None # Happens for the last page yield GogsListerPage(repos=repos, next_link=next_link) if next_link is not None: body, links = self.page_request(next_link, {}) def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: """Convert a page of Gogs repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None assert page.repos is not None for r in page.repos: last_update = iso8601.parse_date(r["updated_at"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=r["clone_url"], last_update=last_update, ) def commit_page(self, page: GogsListerPage) -> None: last_seen_next_link = page.next_link page_id = _parse_page_id(last_seen_next_link) state_page_id = 
_parse_page_id(self.state.last_seen_next_link) if page_id > state_page_id: self.state.last_seen_next_link = last_seen_next_link if (page.repos is not None) and len(page.repos) > 0: self.state.last_seen_repo_id = page.repos[-1]["id"] def finalize(self) -> None: scheduler_state = self.get_state_from_scheduler() state_page_id = _parse_page_id(self.state.last_seen_next_link) scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link) state_last_repo_id = self.state.last_seen_repo_id or 0 scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0 if (state_page_id >= scheduler_page_id) and ( state_last_repo_id > scheduler_last_repo_id ): self.updated = True # Marked updated only if it finds new repos diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 10e5935..36a247b 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -1,164 +1,170 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import json import logging from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class GolangStateType: last_seen: Optional[datetime] = None """Last timestamp of a package version we have saved. Used as a starting point for an incremental listing.""" GolangPageType = List[Dict[str, Any]] class GolangLister(Lister[GolangStateType, GolangPageType]): """ List all Golang modules and send associated origins to scheduler. The lister queries the Golang module index, whose documentation can be found at https://index.golang.org """ GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index" # `limit` seems to be... limited to 2000. 
GOLANG_MODULES_INDEX_LIMIT = 2000 LISTER_NAME = "golang" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GOLANG_MODULES_INDEX_URL, instance=self.LISTER_NAME, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) self.incremental = incremental def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType: as_string = d.get("last_seen") last_seen = iso8601.parse_date(as_string) if as_string is not None else None return GolangStateType(last_seen=last_seen) def state_to_dict(self, state: GolangStateType) -> Dict[str, Any]: return { "last_seen": state.last_seen.isoformat() if state.last_seen is not None else None } def finalize(self): if self.incremental and self.state.last_seen is not None: scheduler_state = self.get_state_from_scheduler() if ( scheduler_state.last_seen is None or self.state.last_seen > scheduler_state.last_seen ): self.updated = True def api_request(self, url: str) -> List[str]: response = self.http_request(url) return response.text.split() def get_single_page( self, since: Optional[datetime] = None ) -> Tuple[GolangPageType, Optional[datetime]]: """Return a page from the API and the timestamp of its last entry. Since all entries are sorted by chronological order, the timestamp is useful both for pagination and later for incremental runs.""" url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}" if since is not None: # The Golang index does not understand `+00:00` for some reason # and expects the "timezone zero" notation instead. This works # because all times are UTC. utc_offset = since.utcoffset() assert ( utc_offset is not None and utc_offset.total_seconds() == 0 ), "Non-UTC datetime" as_date = since.isoformat().replace("+00:00", "Z") url = f"{url}&since={as_date}" entries = self.api_request(url) page: GolangPageType = [] if not entries: return page, since for as_json in entries: entry = json.loads(as_json) timestamp = iso8601.parse_date(entry["Timestamp"]) # We've already parsed it and we'll need the datetime later, save it entry["Timestamp"] = timestamp page.append(entry) # The index is guaranteed to be sorted in chronological order since = timestamp return page, since def get_pages(self) -> Iterator[GolangPageType]: since = None if self.incremental: since = self.state.last_seen page, since = self.get_single_page(since=since) if since == self.state.last_seen: # The index returns packages whose timestamp are greater or # equal to the date provided as parameter, which will create # an infinite loop if not stopped here. return [], since if since is not None: self.state.last_seen = since while page: yield page page, since = self.get_single_page(since=since) if since == self.state.last_seen: return [], since if since is not None: self.state.last_seen = since def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]: """ Iterate on all Golang projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None for module in page: path = module["Path"] # The loader will be expected to use the golang proxy to do the # actual downloading. We're using `pkg.go.dev` so that the URL points # to somewhere useful for a human instead of an (incomplete) API path. 
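# Illustrative example (hypothetical module path): an index entry whose Path is
# "github.com/gorilla/mux" is turned into the origin URL
# "https://pkg.go.dev/github.com/gorilla/mux" by the f-string below.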
origin_url = f"https://pkg.go.dev/{path}" # Since the Go index lists versions and not just packages, there will # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, # so only the last timestamp will be used, with no duplicates. # Performance should not be an issue as they are sent to the db in bulk. yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="golang", last_update=module["Timestamp"], ) diff --git a/swh/lister/hackage/__init__.py b/swh/lister/hackage/__init__.py index 906a878..cdc322a 100644 --- a/swh/lister/hackage/__init__.py +++ b/swh/lister/hackage/__init__.py @@ -1,99 +1,103 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ Hackage lister ============== The Hackage lister list origins from `hackage.haskell.org`_, the `Haskell`_ Package Repository. The registry provide an `http api`_ from where the lister retrieve package names and build origins urls. As of August 2022 `hackage.haskell.org`_ list 15536 package names. Origins retrieving strategy --------------------------- To get a list of all package names we make a POST call to -`https://hackage.haskell.org/packages/search` endpoint with some params given as +``https://hackage.haskell.org/packages/search`` endpoint with some params given as json data. Default params:: { "page": 0, "sortColumn": "default", "sortDirection": "ascending", "searchQuery": "(deprecated:any)", } The page size is 50. The lister will make has much http api call has needed to get all results. +For incremental mode we expand the search query with ``lastUpload`` greater than +``state.last_listing_date``, the api will return all new or updated package names since +last run. + Page listing ------------ The result is paginated, each page is 50 records long. Entry data set example:: { "description": "3D model parsers", "downloads": 6, "lastUpload": "2014-11-08T03:55:23.879047Z", "maintainers": [{"display": "capsjac", "uri": "/user/capsjac"}], "name": {"display": "3dmodels", "uri": "/package/3dmodels"}, "tags": [ {"display": "graphics", "uri": "/packages/tag/graphics"}, {"display": "lgpl", "uri": "/packages/tag/lgpl"}, {"display": "library", "uri": "/packages/tag/library"}, ], "votes": 1.5, } Origins from page ----------------- The lister yields 50 origins url per page. -Each ListedOrigin has a `last_update` date set. +Each ListedOrigin has a ``last_update`` date set. Running tests ------------- Activate the virtualenv and run from within swh-lister directory:: pytest -s -vv --log-cli-level=DEBUG swh/lister/hackage/tests Testing with Docker ------------------- Change directory to swh/docker then launch the docker environment:: docker compose up -d Then schedule an Hackage listing task:: docker compose exec swh-scheduler swh scheduler task add -p oneshot list-hackage You can follow lister execution by displaying logs of swh-lister service:: docker compose logs -f swh-lister .. _hackage.haskell.org: https://hackage.haskell.org/ .. _Haskell: https://haskell.org/ .. 
_http api: https://hackage.haskell.org/api """ def register(): from .lister import HackageLister return { "lister": HackageLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py index ffe72cc..a86ff67 100644 --- a/swh/lister/hackage/lister.py +++ b/swh/lister/hackage/lister.py @@ -1,100 +1,150 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import dataclass +from datetime import datetime, timezone import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. HackageListerPage = List[Dict[str, Any]] -class HackageLister(StatelessLister[HackageListerPage]): +@dataclass +class HackageListerState: + """Store lister state for incremental mode operations""" + + last_listing_date: Optional[datetime] = None + """Last date when Hackage lister was executed""" + + +class HackageLister(Lister[HackageListerState, HackageListerPage]): """List Hackage (The Haskell Package Repository) origins.""" LISTER_NAME = "hackage" VISIT_TYPE = "hackage" INSTANCE = "hackage" BASE_URL = "https://hackage.haskell.org/" PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search" PACKAGE_INFO_URL_PATTERN = "{base_url}package/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, url: Optional[str] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=url if url else self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Ensure to set this with same value as the http api search endpoint use # (50 as of august 2022) self.page_size: int = 50 + self.listing_date = datetime.now().astimezone(tz=timezone.utc) + + def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState: + last_listing_date = d.get("last_listing_date") + if last_listing_date is not None: + d["last_listing_date"] = iso8601.parse_date(last_listing_date) + return HackageListerState(**d) + + def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]: + d: Dict[str, Optional[str]] = {"last_listing_date": None} + last_listing_date = state.last_listing_date + if last_listing_date is not None: + d["last_listing_date"] = last_listing_date.isoformat() + return d def get_pages(self) -> Iterator[HackageListerPage]: """Yield an iterator which returns 'page' It uses the http api endpoint `https://hackage.haskell.org/packages/search` to get a list of package names from which we build an origin url. Results are paginated. 
""" + # Search query + sq = "(deprecated:any)" + + if self.state.last_listing_date: + last_str = ( + self.state.last_listing_date.astimezone(tz=timezone.utc) + .date() + .isoformat() + ) + + # Incremental mode search query + sq += "(lastUpload >= %s)" % last_str + params = { "page": 0, "sortColumn": "default", "sortDirection": "ascending", - "searchQuery": "(deprecated:any)", + "searchQuery": sq, } data = self.http_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), method="POST", json=params, ).json() - nb_entries: int = data["numberOfResults"] - (nb_pages, remainder) = divmod(nb_entries, self.page_size) - if remainder: - nb_pages += 1 - yield data["pageContents"] - - for page in range(1, nb_pages): - params["page"] = page - data = self.http_request( - url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), - method="POST", - json=params, - ).json() + if data.get("pageContents"): + nb_entries: int = data["numberOfResults"] + (nb_pages, remainder) = divmod(nb_entries, self.page_size) + if remainder: + nb_pages += 1 + # First page yield data["pageContents"] + # Next pages + for page in range(1, nb_pages): + params["page"] = page + data = self.http_request( + url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), + method="POST", + json=params, + ).json() + yield data["pageContents"] def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for entry in page: pkgname = entry["name"]["display"] last_update = iso8601.parse_date(entry["lastUpload"]) url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) + yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, ) + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 new file mode 100644 index 0000000..598d748 --- /dev/null +++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit1 @@ -0,0 +1 @@ +{"numberOfResults":3,"pageContents":[{"description":"Translations of classic Truth Maintenance Systems","downloads":14,"lastUpload":"2022-09-13T19:21:15.533437837Z","maintainers":[{"display":"jpmrst","uri":"/user/jpmrst"}],"name":{"display":"BPS","uri":"/package/BPS"},"tags":[{"display":"gpl","uri":"/packages/tag/gpl"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"truth-maintenance","uri":"/packages/tag/truth-maintenance"}],"votes":0},{"description":"C-Structs implementation for Haskell","downloads":25,"lastUpload":"2022-09-30T08:00:34.348551203Z","maintainers":[{"display":"SimonPlakolb","uri":"/user/SimonPlakolb"}],"name":{"display":"C-structs","uri":"/package/C-structs"},"tags":[{"display":"c","uri":"/packages/tag/c"},{"display":"data","uri":"/packages/tag/data"},{"display":"foreign","uri":"/packages/tag/foreign"},{"display":"library","uri":"/packages/tag/library"},{"display":"mit","uri":"/packages/tag/mit"},{"display":"structures","uri":"/packages/tag/structures"}],"votes":2},{"description":"Cluster algorithms, PCA, and chemical conformere 
analysis","downloads":29,"lastUpload":"2022-09-28T11:54:25.8011197Z","maintainers":[{"display":"phillipseeber","uri":"/user/phillipseeber"}],"name":{"display":"ConClusion","uri":"/package/ConClusion"},"tags":[{"display":"agpl","uri":"/packages/tag/agpl"},{"display":"chemistry","uri":"/packages/tag/chemistry"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"statistics","uri":"/packages/tag/statistics"}],"votes":2}]} diff --git a/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 new file mode 100644 index 0000000..20d1ebd --- /dev/null +++ b/swh/lister/hackage/tests/data/https_hackage.haskell.org/packages_search_0_visit2 @@ -0,0 +1 @@ +{"numberOfResults":0,"pageContents":[]} diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py index 93bb6f4..80d4c49 100644 --- a/swh/lister/hackage/tests/test_lister.py +++ b/swh/lister/hackage/tests/test_lister.py @@ -1,100 +1,197 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import json from pathlib import Path from urllib.parse import unquote, urlparse -from swh.lister.hackage.lister import HackageLister +import iso8601 +from swh.lister.hackage.lister import HackageLister, HackageListerState -def json_callback(request, context, datadir): - """Callback for requests_mock that load a json file regarding a page number""" - page = request.json()["page"] +def json_callback(request, context, datadir, visit=0): + """Callback for requests_mock that load a json file regarding a page number""" unquoted_url = unquote(request.url) url = urlparse(unquoted_url) + page = request.json()["page"] + dirname = "%s_%s" % (url.scheme, url.hostname) filename = url.path[1:] if filename.endswith("/"): filename = filename[:-1] filename = filename.replace("/", "_") + filepath = Path(datadir, dirname, f"{filename}_{page}") - return json.loads(Path(datadir, dirname, f"{filename}_{page}").read_text()) + if visit > 0: + filepath = filepath.parent / f"{filepath.stem}_visit{visit}" + return json.loads(filepath.read_text()) def test_hackage_lister(swh_scheduler, requests_mock, datadir): + """Assert a full listing of 3 pages of 50 origins""" requests_mock.post( url="https://hackage.haskell.org/packages/search", status_code=200, json=functools.partial(json_callback, datadir=datadir), ) expected_origins = [] for page in [0, 1, 2]: data = json.loads( Path( datadir, "https_hackage.haskell.org", f"packages_search_{page}" ).read_text() ) for entry in data["pageContents"]: pkgname = entry["name"]["display"] expected_origins.append( {"url": f"https://hackage.haskell.org/package/{pkgname}"} ) lister = HackageLister(scheduler=swh_scheduler) res = lister.run() assert res.pages == 3 assert res.origins == res.pages * 50 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(expected_origins) assert { ( scheduled.visit_type, scheduled.url, ) for scheduled in scheduler_origins } == { ( "hackage", expected["url"], ) for expected in expected_origins } def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir): + """Test Pagination + + Page size is 50, lister returns 1 page when origins < page 
size + """ requests_mock.post( url="https://fake49.haskell.org/packages/search", status_code=200, json=functools.partial(json_callback, datadir=datadir), ) lister = HackageLister(scheduler=swh_scheduler, url="https://fake49.haskell.org/") pages = list(lister.get_pages()) # there should be 1 page with 49 entries assert len(pages) == 1 assert len(pages[0]) == 49 def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir): + """Test Pagination + + Page size is 50, lister returns 2 page when origins > page size + """ requests_mock.post( url="https://fake51.haskell.org/packages/search", status_code=200, json=functools.partial(json_callback, datadir=datadir), ) lister = HackageLister(scheduler=swh_scheduler, url="https://fake51.haskell.org/") pages = list(lister.get_pages()) # there should be 2 pages with 50 + 1 entries assert len(pages) == 2 assert len(pages[0]) == 50 assert len(pages[1]) == 1 + + +def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir): + """Test incremental lister + + * First run, full listing, 3 pages, 150 origins + * Second run, 1 page, 3 new or updated origins + * Third run, nothing new, 0 page, 0 origins + """ + + mock_url = "https://hackage.haskell.org/packages/search" + + # first run + requests_mock.post( + url=mock_url, + status_code=200, + json=functools.partial(json_callback, datadir=datadir), + ) + lister = HackageLister(scheduler=swh_scheduler) + # force lister.last_listing_date to not being 'now' + lister.state.last_listing_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z") + lister.set_state_in_scheduler() + assert lister.get_state_from_scheduler() == HackageListerState( + last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z") + ) + + first = lister.run() + assert first.pages == 3 + assert first.origins == 3 * 50 + # 3 http requests done + assert len(requests_mock.request_history) == 3 + for rh in requests_mock.request_history: + assert rh.json()["searchQuery"] == "(deprecated:any)(lastUpload >= 2022-08-26)" + + # second run + requests_mock.post( + url=mock_url, + status_code=200, + json=functools.partial(json_callback, datadir=datadir, visit=1), + ) + lister = HackageLister(scheduler=swh_scheduler) + # force lister.last_listing_date to not being 'now' + lister.state.last_listing_date = iso8601.parse_date( + "2022-09-30T08:00:34.348551203Z" + ) + lister.set_state_in_scheduler() + assert lister.get_state_from_scheduler() == HackageListerState( + last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z") + ) + + second = lister.run() + assert second.pages == 1 + assert second.origins == 3 + + assert len(requests_mock.request_history) == 3 + 1 + # Check the first three ones, should be the same as first run + for i in range(3): + assert ( + requests_mock.request_history[i].json()["searchQuery"] + == "(deprecated:any)(lastUpload >= 2022-08-26)" + ) + # Check the last one, lastUpload should be the same as second run + assert ( + requests_mock.last_request.json()["searchQuery"] + == "(deprecated:any)(lastUpload >= 2022-09-30)" + ) + + # third run (no update since last run, no new or updated origins but one http requests + # with no results) + requests_mock.post( + url=mock_url, + status_code=200, + json=functools.partial(json_callback, datadir=datadir, visit=2), + ) + lister = HackageLister(scheduler=swh_scheduler) + third = lister.run() + + assert third.pages == 0 + assert third.origins == 0 + assert lister.get_state_from_scheduler() == HackageListerState( + 
last_listing_date=lister.state.last_listing_date + ) + assert len(requests_mock.request_history) == 3 + 1 + 1 diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index e9c36fa..b9daa18 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -1,209 +1,215 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import logging from typing import Any, Dict, Iterator, Optional, Tuple import iso8601 from launchpadlib.launchpad import Launchpad from lazr.restfulclient.errors import RestfulError from lazr.restfulclient.resource import Collection from tenacity.before_sleep import before_sleep_log from swh.lister.utils import http_retry, retry_if_exception from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) VcsType = str LaunchpadPageType = Tuple[VcsType, Collection] SUPPORTED_VCS_TYPES = ("git", "bzr") @dataclass class LaunchpadListerState: """State of Launchpad lister""" git_date_last_modified: Optional[datetime] = None """modification date of last updated git repository since last listing""" bzr_date_last_modified: Optional[datetime] = None """modification date of last updated bzr repository since last listing""" def origin(vcs_type: str, repo: Any) -> str: """Determine the origin url out of a repository with a given vcs_type""" return repo.git_https_url if vcs_type == "git" else repo.web_link def retry_if_restful_error(retry_state): return retry_if_exception(retry_state, lambda e: isinstance(e, RestfulError)) class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): """ List repositories from Launchpad (git or bzr). 
Args: scheduler: instance of SchedulerInterface incremental: defines if incremental listing should be used, in that case only modified or new repositories since last incremental listing operation will be returned """ LISTER_NAME = "launchpad" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://launchpad.net/", instance="launchpad", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.date_last_modified: Dict[str, Optional[datetime]] = { "git": None, "bzr": None, } def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState: for vcs_type in SUPPORTED_VCS_TYPES: key = f"{vcs_type}_date_last_modified" date_last_modified = d.get(key) if date_last_modified is not None: d[key] = iso8601.parse_date(date_last_modified) return LaunchpadListerState(**d) def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {} for vcs_type in SUPPORTED_VCS_TYPES: attribute_name = f"{vcs_type}_date_last_modified" d[attribute_name] = None if hasattr(state, attribute_name): date_last_modified = getattr(state, attribute_name) if date_last_modified is not None: d[attribute_name] = date_last_modified.isoformat() return d @http_retry( retry=retry_if_restful_error, before_sleep=before_sleep_log(logger, logging.WARNING), ) def _page_request( self, launchpad, vcs_type: str, date_last_modified: Optional[datetime] ) -> Optional[Collection]: """Querying the page of results for a given vcs_type since the date_last_modified. If some issues occurs, this will deal with the retrying policy. """ get_vcs_fns = { "git": launchpad.git_repositories.getRepositories, "bzr": launchpad.branches.getBranches, } return get_vcs_fns[vcs_type]( order_by="most neglected first", modified_since_date=date_last_modified, ) def get_pages(self) -> Iterator[LaunchpadPageType]: """ Yields an iterator on all git/bzr repositories hosted on Launchpad sorted by last modification date in ascending order. """ launchpad = Launchpad.login_anonymously( "softwareheritage", "production", version="devel" ) if self.incremental: self.date_last_modified = { "git": self.state.git_date_last_modified, "bzr": self.state.bzr_date_last_modified, } for vcs_type in SUPPORTED_VCS_TYPES: try: result = self._page_request( launchpad, vcs_type, self.date_last_modified[vcs_type] ) except RestfulError as e: logger.warning("Listing %s origins raised %s", vcs_type, e) result = None if not result: continue yield vcs_type, result def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: """ Iterate on all git repositories and yield ListedOrigin instances. 
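A minimal sketch (not the LaunchpadLister code itself) of the state round-trip performed by `state_to_dict`/`state_from_dict` above: per-VCS last-modified dates are stored as ISO-8601 strings in the scheduler and parsed back with `iso8601` on the next run. The helper names below are illustrative only.

```
from datetime import datetime, timezone
from typing import Dict, Optional

import iso8601


def to_state_dict(dates: Dict[str, Optional[datetime]]) -> Dict[str, Optional[str]]:
    # Serialize aware datetimes to ISO-8601 strings; missing dates stay None.
    return {key: value.isoformat() if value else None for key, value in dates.items()}


def from_state_dict(d: Dict[str, Optional[str]]) -> Dict[str, Optional[datetime]]:
    # Parse the ISO-8601 strings back into timezone-aware datetimes.
    return {key: iso8601.parse_date(value) if value else None for key, value in d.items()}


state = {
    "git_date_last_modified": datetime(2022, 9, 30, 8, 0, tzinfo=timezone.utc),
    "bzr_date_last_modified": None,
}
assert from_state_dict(to_state_dict(state)) == state
```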
""" assert self.lister_obj.id is not None vcs_type, repos = page try: for repo in repos: origin_url = origin(vcs_type, repo) # filter out origins with invalid URL if not origin_url.startswith("https://"): continue last_update = repo.date_last_modified self.date_last_modified[vcs_type] = last_update logger.debug( "Found origin %s with type %s last updated on %s", origin_url, vcs_type, last_update, ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=vcs_type, url=origin_url, last_update=last_update, ) except RestfulError as e: logger.warning("Listing %s origins raised %s", vcs_type, e) def finalize(self) -> None: git_date_last_modified = self.date_last_modified["git"] bzr_date_last_modified = self.date_last_modified["bzr"] if git_date_last_modified is None and bzr_date_last_modified is None: return if self.incremental and ( self.state.git_date_last_modified is None or ( git_date_last_modified is not None and git_date_last_modified > self.state.git_date_last_modified ) ): self.state.git_date_last_modified = git_date_last_modified if self.incremental and ( self.state.bzr_date_last_modified is None or ( bzr_date_last_modified is not None and bzr_date_last_modified > self.state.bzr_date_last_modified ) ): self.state.bzr_date_last_modified = self.date_last_modified["bzr"] self.updated = True diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 195a8a3..8dc702c 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,403 +1,409 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup import lxml import requests from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. 
Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) self.jar_origins: Dict[str, ListedOrigin] = {} def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None try: response = self.http_request(self.INDEX_URL, stream=True) except requests.HTTPError: logger.error("Index %s not found, stopping", self.INDEX_URL) raise # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P[^|]+)\|(?P[^|]+)\|(?P[^|]+)\|" + r"(?P[^|]+)\|(?P[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. 
if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms..") for pom_url in out_pom: try: response = self.http_request(pom_url) parsed_pom = BeautifulSoup(response.content, "xml") project = parsed_pom.find("project") if project is None: continue scm = project.find("scm") if scm is not None: connection = scm.find("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom_url], "url": connection.text, } logger.debug( "* Yielding pom %s: %s", pom_url, artifact_metadata_d ) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom_url) else: logger.debug("No scm in pom %s", pom_url) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom_url, ) except lxml.etree.Error as error: logger.info("Could not parse POM %s XML: %s.", pom_url, error) def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: """Retrieve scm origin out of the page information. Only called when type of the page is scm. Try and detect an scm/vcs repository. Note that official format is in the form: scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put the repo url (without the "scm:type"), so we have to check against the content to extract the type and url properly. Raises AssertionError when the type of the page is not 'scm' Returns ListedOrigin with proper canonical scm url (for github) if any is found, None otherwise. 
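The `scm:{type}:{url}` parsing that `get_scm` describes can be summarised in a small, self-contained sketch. The connection string used below is the one from the RepoPage example earlier in this diff; the helper name is illustrative and not part of the lister.

```
import re
from typing import Optional, Tuple

SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")


def parse_scm_connection(connection: str) -> Optional[Tuple[str, str]]:
    """Return (visit_type, url), or None when the connection string is unusable."""
    m = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", connection)
    if m and m.group("type") in SUPPORTED_SCM_TYPES:
        return m.group("type"), m.group("url")
    # Some projects put a bare repository URL without the "scm:<type>:" prefix.
    if connection.endswith(".git"):
        return "git", connection
    return None


assert parse_scm_connection(
    "scm:git:git://github.com/openengsb/openengsb-framework.git"
) == ("git", "git://github.com/openengsb/openengsb-framework.git")
```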
""" assert page["type"] == "scm" visit_type: Optional[str] = None url: Optional[str] = None m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) if m_scm is None: return None scm_type = m_scm.group("type") if scm_type and scm_type in SUPPORTED_SCM_TYPES: url = m_scm.group("url") visit_type = scm_type elif page["url"].endswith(".git"): url = page["url"].lstrip("scm:") visit_type = "git" else: return None if url and visit_type == "git": assert self.github_session is not None # Non-github urls will be returned as is, github ones will be canonical ones url = self.github_session.get_canonical_url(url) if not url: return None assert visit_type is not None assert self.lister_obj.id is not None return ListedOrigin( lister_id=self.lister_obj.id, url=url, visit_type=visit_type, ) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Maven repositories into a list of ListedOrigins.""" if page["type"] == "scm": listed_origin = self.get_scm(page) if listed_origin: yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 3e410aa..3440a8e 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -1,566 +1,572 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """NixGuix lister definition. This lists artifacts out of manifest for Guix or Nixpkgs manifests. Artifacts can be of types: - upstream git repository (NixOS/nixpkgs, Guix) - VCS repositories (svn, git, hg, ...) - unique file - unique tarball """ import base64 import binascii from dataclasses import dataclass from enum import Enum import logging from pathlib import Path import random import re from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from urllib.parse import parse_qsl, urlparse import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT from swh.lister import TARBALL_EXTENSIONS from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) # By default, ignore binary files and archives containing binaries DEFAULT_EXTENSIONS_TO_IGNORE = [ "AppImage", "bin", "exe", "iso", "linux64", "msi", "png", "dic", "deb", "rpm", ] class ArtifactNatureUndetected(ValueError): """Raised when a remote artifact's nature (tarball, file) cannot be detected.""" pass class ArtifactNatureMistyped(ValueError): """Raised when a remote artifact is neither a tarball nor a file. Error of this type are' probably a misconfiguration in the manifest generation that badly typed a vcs repository. """ pass class ArtifactWithoutExtension(ValueError): """Raised when an artifact nature cannot be determined by its name.""" pass class ChecksumsComputation(Enum): """The possible artifact types listed out of the manifest.""" STANDARD = "standard" """Standard checksums (e.g. sha1, sha256, ...) on the tarball or file.""" NAR = "nar" """The hash is computed over the NAR archive dump of the output (e.g. uncompressed directory.)""" MAPPING_CHECKSUMS_COMPUTATION = { "flat": ChecksumsComputation.STANDARD, "recursive": ChecksumsComputation.NAR, } """Mapping between the outputHashMode from the manifest and how to compute checksums.""" @dataclass class Artifact: """Metadata information on Remote Artifact with url (tarball or file).""" origin: str """Canonical url retrieve the tarball artifact.""" visit_type: str """Either 'tar' or 'file' """ fallback_urls: List[str] """List of urls to retrieve tarball artifact if canonical url no longer works.""" checksums: Dict[str, str] """Integrity hash converted into a checksum dict.""" checksums_computation: ChecksumsComputation """Checksums computation mode to provide to loaders (e.g. nar, standard, ...)""" @dataclass class VCS: """Metadata information on VCS.""" origin: str """Origin url of the vcs""" type: str """Type of (d)vcs, e.g. 
svn, git, hg, ...""" ref: Optional[str] = None """Reference either a svn commit id, a git commit, ...""" class ArtifactType(Enum): """The possible artifact types listed out of the manifest.""" ARTIFACT = "artifact" VCS = "vcs" PageResult = Tuple[ArtifactType, Union[Artifact, VCS]] VCS_SUPPORTED = ("git", "svn", "hg") # Rough approximation of what we can find of mimetypes for tarballs "out there" POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys()) PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+") def url_endswith( urlparsed, extensions: List[str], raise_when_no_extension: bool = True ) -> bool: """Determine whether urlparsed ends with one of the extensions passed as parameter. This also account for the edge case of a filename with only a version as name (so no extension in the end.) Raises: ArtifactWithoutExtension in case no extension is available and raise_when_no_extension is True (the default) """ paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)] if raise_when_no_extension and not any(path.suffix != "" for path in paths): raise ArtifactWithoutExtension match = any(path.suffix.endswith(tuple(extensions)) for path in paths) if match: return match # Some false negative can happen (e.g. https:///path/0.1.5)), so make sure # to catch those name = Path(urlparsed.path).name if not PATTERN_VERSION.match(name): return match if raise_when_no_extension: raise ArtifactWithoutExtension return False def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: """Determine whether a list of files actually are tarballs or simple files. When this cannot be answered simply out of the url, when request is provided, this executes a HTTP `HEAD` query on the url to determine the information. If request is not provided, this raises an ArtifactNatureUndetected exception. Args: urls: name of the remote files for which the extension needs to be checked. Raises: ArtifactNatureUndetected when the artifact's nature cannot be detected out of its url ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to the caller to do what's right with it. Returns: A tuple (bool, url). The boolean represents whether the url is an archive or not. The second parameter is the actual url once the head request is issued as a fallback of not finding out whether the urls are tarballs or not. """ def _is_tarball(url): """Determine out of an extension whether url is a tarball. Raises: ArtifactWithoutExtension in case no extension is available """ urlparsed = urlparse(url) if urlparsed.scheme not in ("http", "https", "ftp"): raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") return url_endswith(urlparsed, TARBALL_EXTENSIONS) index = random.randrange(len(urls)) url = urls[index] try: return _is_tarball(url), urls[0] except ArtifactWithoutExtension: if request is None: raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) logger.warning( "Cannot detect extension for <%s>. 
Fallback to http head query", url, ) try: response = request.head(url) except (InvalidSchema, SSLError, ConnectionError): raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) if not response.ok or response.status_code == 404: raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) location = response.headers.get("Location") if location: # It's not always present logger.debug("Location: %s", location) try: # FIXME: location is also returned as it's considered the true origin, # true enough? return _is_tarball(location), location except ArtifactWithoutExtension: logger.warning( "Still cannot detect extension through location <%s>...", url, ) origin = urls[0] content_type = response.headers.get("Content-Type") if content_type: logger.debug("Content-Type: %s", content_type) if content_type == "application/json": return False, origin return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin content_disposition = response.headers.get("Content-Disposition") if content_disposition: logger.debug("Content-Disposition: %s", content_disposition) if "filename=" in content_disposition: fields = content_disposition.split("; ") for field in fields: if "filename=" in field: _, filename = field.split("filename=") break return ( url_endswith( urlparse(filename), TARBALL_EXTENSIONS, raise_when_no_extension=False, ), origin, ) raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" ) VCS_KEYS_MAPPING = { "git": { "ref": "git_ref", "url": "git_url", }, "svn": { "ref": "svn_revision", "url": "svn_url", }, "hg": { "ref": "hg_changeset", "url": "hg_url", }, } class NixGuixLister(StatelessLister[PageResult]): """List Guix or Nix sources out of a public json manifest. This lister can output: - unique tarball (.tar.gz, .tbz2, ...) - vcs repositories (e.g. git, hg, svn) - unique file (.lisp, .py, ...) Note that no `last_update` is available in either manifest. For `url` types artifacts, this tries to determine the artifact's nature, tarball or file. It first tries to compute out of the "url" extension. In case of no extension, it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location` response header, and then checks the extension again. Optionally, when the `extension_to_ignore` parameter is provided, it extends the default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed. This can be used to drop further binary files detected in the wild. """ LISTER_NAME = "nixguix" def __init__( self, scheduler, url: str, origin_upstream: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, # canonicalize urls, can be turned off during docker runs canonicalize: bool = True, extensions_to_ignore: List[str] = [], **kwargs: Any, ): super().__init__( scheduler=scheduler, url=url.rstrip("/"), instance=instance, credentials=credentials, with_github_session=canonicalize, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? 
self.origin_upstream = origin_upstream self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore self.session = requests.Session() def build_artifact( self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None ) -> Optional[Tuple[ArtifactType, VCS]]: """Build a canonicalized vcs artifact when possible.""" origin = ( self.github_session.get_canonical_url(artifact_url) if self.github_session else artifact_url ) if not origin: return None return ArtifactType.VCS, VCS( origin=origin, type=artifact_type, ref=artifact_ref ) def get_pages(self) -> Iterator[PageResult]: """Yield one page per "typed" origin referenced in manifest.""" # fetch and parse the manifest... response = self.http_request(self.url) # ... if any raw_data = response.json() yield ArtifactType.VCS, VCS(origin=self.origin_upstream, type="git") # grep '"type"' guix-sources.json | sort | uniq # "type": false <<<<<<<<< noise # "type": "git", # "type": "hg", # "type": "no-origin", <<<<<<<<< noise # "type": "svn", # "type": "url", # grep '"type"' nixpkgs-sources-unstable.json | sort | uniq # "type": "url", sources = raw_data["sources"] random.shuffle(sources) for artifact in sources: artifact_type = artifact["type"] if artifact_type in VCS_SUPPORTED: plain_url = artifact[VCS_KEYS_MAPPING[artifact_type]["url"]] plain_ref = artifact[VCS_KEYS_MAPPING[artifact_type]["ref"]] built_artifact = self.build_artifact( plain_url, artifact_type, plain_ref ) if not built_artifact: continue yield built_artifact elif artifact_type == "url": # It's either a tarball or a file origin_urls = artifact.get("urls") if not origin_urls: # Nothing to fetch logger.warning("Skipping url <%s>: empty artifact", artifact) continue assert origin_urls is not None # Deal with urls with empty scheme (basic fallback to http) urls = [] for url in origin_urls: urlparsed = urlparse(url) if urlparsed.scheme == "" and not re.match(r"^\w+@[^/]+:", url): logger.warning("Missing scheme for <%s>: fallback to http", url) fixed_url = f"http://{url}" else: fixed_url = url urls.append(fixed_url) origin, *fallback_urls = urls if origin.endswith(".git"): built_artifact = self.build_artifact(origin, "git") if not built_artifact: continue yield built_artifact continue outputHash = artifact.get("outputHash") integrity = artifact.get("integrity") if integrity is None and outputHash is None: logger.warning( "Skipping url <%s>: missing integrity and outputHash field", origin, ) continue # Falls back to outputHash field if integrity is missing if integrity is None and outputHash: # We'll deal with outputHash as integrity field integrity = outputHash try: is_tar, origin = is_tarball(urls, self.session) except ArtifactNatureMistyped: logger.warning( "Mistyped url <%s>: trying to deal with it properly", origin ) urlparsed = urlparse(origin) artifact_type = urlparsed.scheme if artifact_type in VCS_SUPPORTED: built_artifact = self.build_artifact(origin, artifact_type) if not built_artifact: continue yield built_artifact else: logger.warning( "Skipping url <%s>: undetected remote artifact type", origin ) continue except ArtifactNatureUndetected: logger.warning( "Skipping url <%s>: undetected remote artifact type", origin ) continue # Determine the content checksum stored in the integrity field and # convert into a dict of checksums. 
This only parses the # `hash-expression` (hash-) as defined in # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute try: chksum_algo, chksum_b64 = integrity.split("-") checksums: Dict[str, str] = { chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() } except binascii.Error: logger.exception( "Skipping url: <%s>: integrity computation failure for <%s>", url, artifact, ) continue # The 'outputHashMode' attribute determines how the hash is computed. It # must be one of the following two values: # - "flat": (default) The output must be a non-executable regular file. # If it isn’t, the build fails. The hash is simply computed over the # contents of that file (so it’s equal to what Unix commands like # `sha256sum` or `sha1sum` produce). # - "recursive": The hash is computed over the NAR archive dump of the # output (i.e., the result of `nix-store --dump`). In this case, # the output can be anything, including a directory tree. outputHashMode = artifact.get("outputHashMode", "flat") if not is_tar and outputHashMode == "recursive": # T4608: Cannot deal with those properly yet as some can be missing # 'critical' information about how to recompute the hash (e.g. fs # layout, executable bit, ...) logger.warning( "Skipping artifact <%s>: 'file' artifact of type <%s> is" " missing information to properly check its integrity", artifact, artifact_type, ) continue # At this point plenty of heuristics happened and we should have found # the right origin and its nature. # Let's check and filter it out if it is to be ignored (if possible). # Some origin urls may not have extension at this point (e.g # http://git.marmaro.de/?p=mmh;a=snp;h=;sf=tgz), let them through. if url_endswith( urlparse(origin), self.extensions_to_ignore, raise_when_no_extension=False, ): logger.warning( "Skipping artifact <%s>: 'file' artifact of type <%s> is" " ignored due to lister configuration. It should ignore" " origins with extension [%s]", origin, artifact_type, ",".join(self.extensions_to_ignore), ) continue logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) yield ArtifactType.ARTIFACT, Artifact( origin=origin, fallback_urls=fallback_urls, checksums=checksums, checksums_computation=MAPPING_CHECKSUMS_COMPUTATION[outputHashMode], visit_type="directory" if is_tar else "content", ) else: logger.warning( "Skipping artifact <%s>: unsupported type %s", artifact, artifact_type, ) def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]: """Given a vcs repository, yield a ListedOrigin.""" assert self.lister_obj.id is not None # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...) 
yield ListedOrigin( lister_id=self.lister_obj.id, url=artifact.origin, visit_type=artifact.type, ) def artifact_to_listed_origin(self, artifact: Artifact) -> Iterator[ListedOrigin]: """Given an artifact (tarball, file), yield one ListedOrigin.""" assert self.lister_obj.id is not None yield ListedOrigin( lister_id=self.lister_obj.id, url=artifact.origin, visit_type=artifact.visit_type, extra_loader_arguments={ "checksums": artifact.checksums, "checksums_computation": artifact.checksums_computation.value, "fallback_urls": artifact.fallback_urls, }, ) def get_origins_from_page( self, artifact_tuple: PageResult ) -> Iterator[ListedOrigin]: """Given an artifact tuple (type, artifact), yield a ListedOrigin.""" artifact_type, artifact = artifact_tuple mapping_type_fn = getattr(self, f"{artifact_type.value}_to_listed_origin") yield from mapping_type_fn(artifact) diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index b940699..f10c02d 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -1,170 +1,176 @@ # Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 from swh.lister.pattern import CredentialsType, Lister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) @dataclass class NpmListerState: """State of npm lister""" last_seq: Optional[int] = None class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): """ List all packages hosted on the npm registry. The lister is based on the npm replication API powered by a CouchDB database (https://docs.couchdb.org/en/stable/api/database/). 
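Looking back at the NixGuix lister above: the `checksums` dict passed to loaders is derived from the manifest's SRI-style `integrity` value (`<algo>-<base64 digest>`). A minimal sketch of that conversion (not the lister's exact code), with an illustrative helper name and the well-known SHA-256 digest of the empty string as input.

```
import base64
from typing import Dict


def integrity_to_checksums(integrity: str) -> Dict[str, str]:
    # "sha256-<base64>" -> {"sha256": "<hex digest>"}
    algo, b64_digest = integrity.split("-", 1)
    return {algo: base64.decodebytes(b64_digest.encode()).hex()}


checksums = integrity_to_checksums("sha256-47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=")
assert checksums["sha256"] == (
    "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
)
```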
Args: scheduler: a scheduler instance page_size: number of packages info to return per page when querying npm API incremental: defines if incremental listing should be used, in that case only modified or new packages since last incremental listing operation will be returned, otherwise all packages will be listed in lexicographical order """ LISTER_NAME = "npm" INSTANCE = "npm" API_BASE_URL = "https://replicate.npmjs.com" API_INCREMENTAL_LISTING_URL = f"{API_BASE_URL}/_changes" API_FULL_LISTING_URL = f"{API_BASE_URL}/_all_docs" PACKAGE_URL_TEMPLATE = "https://www.npmjs.com/package/{package_name}" def __init__( self, scheduler: SchedulerInterface, page_size: int = 1000, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_INCREMENTAL_LISTING_URL if incremental else self.API_FULL_LISTING_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.page_size = page_size if not incremental: # in full listing mode, first package in each page corresponds to the one # provided as the startkey query parameter value, so we increment the page # size by one to avoid double package processing self.page_size += 1 self.incremental = incremental self.session.headers.update({"Accept": "application/json"}) def state_from_dict(self, d: Dict[str, Any]) -> NpmListerState: return NpmListerState(**d) def state_to_dict(self, state: NpmListerState) -> Dict[str, Any]: return asdict(state) def request_params(self, last_package_id: str) -> Dict[str, Any]: # include package JSON document to get its last update date params = {"limit": self.page_size, "include_docs": "true"} if self.incremental: params["since"] = last_package_id else: params["startkey"] = last_package_id return params def get_pages(self) -> Iterator[List[Dict[str, Any]]]: last_package_id: str = "0" if self.incremental else '""' if ( self.incremental and self.state is not None and self.state.last_seq is not None ): last_package_id = str(self.state.last_seq) while True: response = self.http_request( self.url, params=self.request_params(last_package_id) ) data = response.json() page = data["results"] if self.incremental else data["rows"] if not page: break if self.incremental or len(page) < self.page_size: yield page else: yield page[:-1] if len(page) < self.page_size: break last_package_id = ( str(page[-1]["seq"]) if self.incremental else f'"{page[-1]["id"]}"' ) def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of Npm repositories into a list of ListedOrigin.""" assert self.lister_obj.id is not None for package in page: # no source code to archive here if not package["doc"].get("versions", {}): continue package_name = package["doc"]["name"] package_latest_version = ( package["doc"].get("dist-tags", {}).get("latest", "") ) last_update = None if package_latest_version in package["doc"].get("time", {}): last_update = iso8601.parse_date( package["doc"]["time"][package_latest_version] ) yield ListedOrigin( lister_id=self.lister_obj.id, url=self.PACKAGE_URL_TEMPLATE.format(package_name=package_name), visit_type="npm", last_update=last_update, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page.""" if self.incremental: last_package = page[-1] last_seq = 
last_package["seq"] if self.state.last_seq is None or last_seq > self.state.last_seq: self.state.last_seq = last_seq def finalize(self): if self.incremental and self.state.last_seq is not None: scheduler_state = self.get_state_from_scheduler() if ( scheduler_state.last_seq is None or self.state.last_seq > scheduler_state.last_seq ): self.updated = True diff --git a/swh/lister/nuget/__init__.py b/swh/lister/nuget/__init__.py index 73aaafa..acf0e1b 100644 --- a/swh/lister/nuget/__init__.py +++ b/swh/lister/nuget/__init__.py @@ -1,79 +1,86 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ NuGet lister ============ The NuGet lister discover origins from `nuget.org`_, NuGet is the package manager for .NET. As .NET packages mostly contains binaries, we keep only track of packages that have a Dvcs repository (GIT, SVN, Mercurial...) url usable as an origin. The `nuget.org/packages`_ list 301,206 packages as of September 2022. Origins retrieving strategy --------------------------- Nuget.org provides an `http api`_ with several endpoint to discover and list packages and versions. -The recommended way to retrieve all packages is to use the `catalog`_ api endpoint. -It provides a first endpoint that list all available pages. We then iterate to get -content of related pages. +The recommended way to `retrieve all packages`_ is to use the `catalog`_ api endpoint. +It provides a `catalog index endpoint`_ that list all available pages. We then iterate to +get content of related pages. + +The lister is incremental following a `cursor`_ principle, based on the value of +``commitTimeStamp`` from the catalog index endpoint. It retrieve only pages for which +``commitTimeStamp``is greater than ``lister.state.last_listing_date``. Page listing ------------ Each page returns a list of packages which is the data of the response request. Origins from page ----------------- For each entry in a page listing we get related metadata through its `package metadata`_ http api endpoint. It returns uri for linked archives that contains binary, not the original source code. Our strategy is then to get a related GIT repository. We use another endpoint for each package to get its `package manifest`_, a .nuspec file (xml data) which may contains a GIT repository url. If we found one, it is used as origin. Running tests ------------- Activate the virtualenv and run from within swh-lister directory:: pytest -s -vv --log-cli-level=DEBUG swh/lister/nuget/tests Testing with Docker ------------------- Change directory to swh/docker then launch the docker environment:: docker compose up -d Then schedule a nuget listing task:: docker compose exec swh-scheduler swh scheduler task add -p oneshot list-nuget You can follow lister execution by displaying logs of swh-lister service:: docker compose logs -f swh-lister .. _nuget.org: https://nuget.org .. _nuget.org/packages: https://www.nuget.org/packages .. _http api: https://api.nuget.org/v3/index.json .. _catalog: https://learn.microsoft.com/en-us/nuget/api/catalog-resource +.. _catalog index endpoint: https://learn.microsoft.com/en-us/nuget/api/catalog-resource#catalog-page-object-in-the-index +.. _retrieve all packages: https://learn.microsoft.com/en-us/nuget/guides/api/query-for-all-published-packages#initialize-a-cursor +.. 
_cursor: https://learn.microsoft.com/en-us/nuget/api/catalog-resource#cursor .. _package metadata: https://learn.microsoft.com/en-us/nuget/api/registration-base-url-resource -.. _package manifest: https://learn.microsoft.com/en-us/nuget/api/package-base-address-resource#download-package-manifest-nuspec # noqa: B950 -""" +.. _package manifest: https://learn.microsoft.com/en-us/nuget/api/package-base-address-resource#download-package-manifest-nuspec +""" # noqa: B950 def register(): from .lister import NugetLister return { "lister": NugetLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py index 51652ec..98f9fc9 100644 --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -1,114 +1,164 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import dataclass +from datetime import datetime import logging -from typing import Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional from bs4 import BeautifulSoup +import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) + # Aliasing the page results returned by `get_pages` method from the lister. NugetListerPage = List[Dict[str, str]] -class NugetLister(StatelessLister[NugetListerPage]): +@dataclass +class NugetListerState: + """Store lister state for incremental mode operations""" + + last_listing_date: Optional[datetime] = None + """Last date from main http api endpoint when lister was executed""" + + +class NugetLister(Lister[NugetListerState, NugetListerPage]): """List Nuget (Package manager for .NET) origins.""" LISTER_NAME = "nuget" INSTANCE = "nuget" API_INDEX_URL = "https://api.nuget.org/v3/catalog0/index.json" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_INDEX_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) + self.listing_date: Optional[datetime] = None + + def state_from_dict(self, d: Dict[str, Any]) -> NugetListerState: + last_listing_date = d.get("last_listing_date") + if last_listing_date is not None: + d["last_listing_date"] = iso8601.parse_date(last_listing_date) + return NugetListerState(**d) + + def state_to_dict(self, state: NugetListerState) -> Dict[str, Any]: + d: Dict[str, Optional[str]] = {"last_listing_date": None} + last_listing_date = state.last_listing_date + if last_listing_date is not None: + d["last_listing_date"] = last_listing_date.isoformat() + return d def get_pages(self) -> Iterator[NugetListerPage]: """Yield an iterator which returns 'page' It uses the following endpoint `https://api.nuget.org/v3/catalog0/index.json` to get a list of pages endpoint to iterate. 
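The cursor behaviour described in the NuGet module documentation above (only fetch catalog pages whose `commitTimeStamp` is newer than the last recorded listing date) can be sketched independently of the lister. The function name below is illustrative; the timestamp reuses the catalog index format already parsed by the lister via `iso8601`.

```
from datetime import datetime
from typing import Dict, List, Optional

import iso8601


def pages_to_fetch(
    index_items: List[Dict[str, str]], last_listing_date: Optional[datetime]
) -> List[str]:
    """Keep only catalog pages committed after the last recorded listing date."""
    selected = []
    for page in index_items:
        commit_timestamp = iso8601.parse_date(page["commitTimeStamp"])
        if last_listing_date is None or commit_timestamp > last_listing_date:
            selected.append(page["@id"])
    return selected


index_items = [
    {
        "@id": "https://api.nuget.org/v3/catalog0/page17100.json",
        "commitTimeStamp": "2022-10-10T04:20:52.8660454Z",
    }
]
# First run (no stored state): every page is selected.
assert pages_to_fetch(index_items, None) == [index_items[0]["@id"]]
# Later run with an up-to-date cursor: nothing left to fetch.
assert pages_to_fetch(
    index_items, iso8601.parse_date("2022-10-10T04:20:52.8660454Z")
) == []
```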
""" index_response = self.http_request(url=self.url) index = index_response.json() - assert "items" in index + assert "commitTimeStamp" in index + self.listing_date = iso8601.parse_date(index["commitTimeStamp"]) + + assert "items" in index for page in index["items"]: + assert page["@id"] - try: - page_response = self.http_request(url=page["@id"]) - page_data = page_response.json() - assert "items" in page_data - yield page_data["items"] - except HTTPError: - logger.warning( - "Failed to fetch page %s, skipping it from listing.", - page["@id"], - ) - continue + assert page["commitTimeStamp"] + + commit_timestamp = iso8601.parse_date(page["commitTimeStamp"]) + + if ( + not self.state.last_listing_date + or commit_timestamp > self.state.last_listing_date + ): + try: + page_response = self.http_request(url=page["@id"]) + page_data = page_response.json() + assert "items" in page_data + yield page_data["items"] + except HTTPError: + logger.warning( + "Failed to fetch page %s, skipping it from listing.", + page["@id"], + ) + continue def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances. .NET packages are binary, dll, etc. We retrieve only packages for which we can find a vcs repository. To check if a vcs repository exists, we need for each entry in a page to retrieve a .nuspec file, which is a package metadata xml file, and search for a `repository` value. """ assert self.lister_obj.id is not None for elt in page: try: res = self.http_request(url=elt["@id"]) except HTTPError: logger.warning( "Failed to fetch page %s, skipping it from listing.", elt["@id"], ) continue data = res.json() pkgname = data["id"] nuspec_url = ( f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/" f"{data['version'].lower()}/{pkgname.lower()}.nuspec" ) + try: res_metadata = self.http_request(url=nuspec_url) except HTTPError: logger.warning( "Failed to fetch nuspec file %s, skipping it from listing.", nuspec_url, ) continue xml = BeautifulSoup(res_metadata.content, "xml") repo = xml.find("repository") if repo and "url" in repo.attrs and "type" in repo.attrs: vcs_url = repo.attrs["url"] vcs_type = repo.attrs["type"] + last_update = iso8601.parse_date(elt["commitTimeStamp"]) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=vcs_type, url=vcs_url, - last_update=None, + last_update=last_update, ) else: continue + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec b/swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec new file mode 100644 index 0000000..29a4a55 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3-flatcontainer_moq.automock_3.5.0-ci0287_moq.automock.nuspec @@ -0,0 +1,25 @@ + + + + Moq.AutoMock + 3.5.0-ci0287 + Tim Kellogg, Adam Hewitt, Kevin Bost + LICENSE + https://aka.ms/deprecateLicenseUrl + https://github.com/moq/Moq.AutoMocker + An auto-mocking container that generates mocks using Moq + Copyright Tim Kellogg 2022 + + + + + + + + + + + + + + \ No newline at end of file diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json new file mode 100644 index 0000000..878b828 --- 
/dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_data_2022.10.10.04.04.00_moq.automock.3.5.0-ci0287.json @@ -0,0 +1,187 @@ +{ + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json", + "@type": [ + "PackageDetails", + "catalog:Permalink" + ], + "authors": "Tim Kellogg, Adam Hewitt, Kevin Bost", + "catalog:commitId": "de4b22b8-397b-4fa1-a160-db3a7c5b17cd", + "catalog:commitTimeStamp": "2022-10-10T04:04:00.6654802Z", + "copyright": "Copyright Tim Kellogg 2022", + "created": "2022-10-10T04:01:52.21Z", + "description": "An auto-mocking container that generates mocks using Moq", + "id": "Moq.AutoMock", + "isPrerelease": true, + "lastEdited": "2022-10-10T04:03:52.51Z", + "licenseFile": "LICENSE", + "licenseUrl": "https://aka.ms/deprecateLicenseUrl", + "listed": true, + "packageHash": "jtvxZ9lJGiNWCvKx4oZByy/knRu86ze833hZa2XvAbzYcSR3gSesdWgbGw1yNGDY0TuHobTETq/lorrtE2/pPA==", + "packageHashAlgorithm": "SHA512", + "packageSize": 70853, + "projectUrl": "https://github.com/moq/Moq.AutoMocker", + "published": "2022-10-10T04:01:52.21Z", + "repository": "", + "verbatimVersion": "3.5.0-ci0287", + "version": "3.5.0-ci0287", + "dependencyGroups": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1", + "@type": "PackageDependencyGroup", + "dependencies": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/moq", + "@type": "PackageDependency", + "id": "Moq", + "range": "[4.18.2, )" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/nonblocking", + "@type": "PackageDependency", + "id": "NonBlocking", + "range": "[2.1.0, )" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/system.valuetuple", + "@type": "PackageDependency", + "id": "System.ValueTuple", + "range": "[4.5.0, )" + } + ], + "targetFramework": ".NETFramework4.6.1" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0", + "@type": "PackageDependencyGroup", + "dependencies": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0/moq", + "@type": "PackageDependency", + "id": "Moq", + "range": "[4.18.2, )" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0/nonblocking", + "@type": "PackageDependency", + "id": "NonBlocking", + "range": "[2.1.0, )" + } + ], + "targetFramework": ".NETStandard2.0" + } + ], + "packageEntries": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#Moq.AutoMock.nuspec", + "@type": "PackageEntry", + "compressedLength": 567, + "fullName": "Moq.AutoMock.nuspec", + "length": 1287, + "name": "Moq.AutoMock.nuspec" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/net461/Moq.AutoMock.dll", + "@type": "PackageEntry", + "compressedLength": 17993, + "fullName": "lib/net461/Moq.AutoMock.dll", + "length": 41984, + "name": "Moq.AutoMock.dll" + }, + { + "@id": 
"https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/net461/Moq.AutoMock.xml", + "@type": "PackageEntry", + "compressedLength": 5031, + "fullName": "lib/net461/Moq.AutoMock.xml", + "length": 55041, + "name": "Moq.AutoMock.xml" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/netstandard2.0/Moq.AutoMock.dll", + "@type": "PackageEntry", + "compressedLength": 17927, + "fullName": "lib/netstandard2.0/Moq.AutoMock.dll", + "length": 41984, + "name": "Moq.AutoMock.dll" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/netstandard2.0/Moq.AutoMock.xml", + "@type": "PackageEntry", + "compressedLength": 5031, + "fullName": "lib/netstandard2.0/Moq.AutoMock.xml", + "length": 55041, + "name": "Moq.AutoMock.xml" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#LICENSE", + "@type": "PackageEntry", + "compressedLength": 628, + "fullName": "LICENSE", + "length": 1068, + "name": "LICENSE" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#analyzers/dotnet/cs/Moq.AutoMocker.TestGenerator.dll", + "@type": "PackageEntry", + "compressedLength": 9686, + "fullName": "analyzers/dotnet/cs/Moq.AutoMocker.TestGenerator.dll", + "length": 25088, + "name": "Moq.AutoMocker.TestGenerator.dll" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#.signature.p7s", + "@type": "PackageEntry", + "compressedLength": 11534, + "fullName": ".signature.p7s", + "length": 11534, + "name": ".signature.p7s" + } + ], + "@context": { + "@vocab": "http://schema.nuget.org/schema#", + "catalog": "http://schema.nuget.org/catalog#", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "dependencies": { + "@id": "dependency", + "@container": "@set" + }, + "dependencyGroups": { + "@id": "dependencyGroup", + "@container": "@set" + }, + "packageEntries": { + "@id": "packageEntry", + "@container": "@set" + }, + "packageTypes": { + "@id": "packageType", + "@container": "@set" + }, + "supportedFrameworks": { + "@id": "supportedFramework", + "@container": "@set" + }, + "tags": { + "@id": "tag", + "@container": "@set" + }, + "vulnerabilities": { + "@id": "vulnerability", + "@container": "@set" + }, + "published": { + "@type": "xsd:dateTime" + }, + "created": { + "@type": "xsd:dateTime" + }, + "lastEdited": { + "@type": "xsd:dateTime" + }, + "catalog:commitTimeStamp": { + "@type": "xsd:dateTime" + }, + "reasons": { + "@container": "@set" + } + } +} \ No newline at end of file diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 new file mode 100644 index 0000000..9f40584 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_index.json_visit1 @@ -0,0 +1,46 @@ +{ + "@id": "https://api.nuget.org/v3/catalog0/index.json", + "@type": [ + "CatalogRoot", + "AppendOnlyCatalog", + "Permalink" + ], + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "count": 16959, + "nuget:lastCreated": "2022-10-10T04:20:52.8660454Z", + "nuget:lastDeleted": "2022-10-10T04:20:52.8660454Z", + "nuget:lastEdited": "2022-10-10T04:20:52.8660454Z", + "items": [ + { + "@id": "https://api.nuget.org/v3/catalog0/page17100.json", + "@type": "CatalogPage", + 
"commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "count": 545 + } + ], + "@context": { + "@vocab": "http://schema.nuget.org/catalog#", + "nuget": "http://schema.nuget.org/schema#", + "items": { + "@id": "item", + "@container": "@set" + }, + "parent": { + "@type": "@id" + }, + "commitTimeStamp": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastCreated": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastEdited": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastDeleted": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + } + } +} diff --git a/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json new file mode 100644 index 0000000..2e7eb13 --- /dev/null +++ b/swh/lister/nuget/tests/data/https_api.nuget.org/v3_catalog0_page17100.json @@ -0,0 +1,49 @@ +{ + "@id": "https://api.nuget.org/v3/catalog0/page17100.json", + "@type": "CatalogPage", + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "count": 545, + "parent": "https://api.nuget.org/v3/catalog0/index.json", + "items": [ + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json", + "@type": "nuget:PackageDetails", + "commitId": "de4b22b8-397b-4fa1-a160-db3a7c5b17cd", + "commitTimeStamp": "2022-10-10T04:04:00.6654802Z", + "nuget:id": "Moq.AutoMock", + "nuget:version": "3.5.0-ci0287" + }, + { + "@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.20.52/alzabox.api.sdk.0.0.13.json", + "@type": "nuget:PackageDetails", + "commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698", + "commitTimeStamp": "2022-10-10T04:20:52.8660454Z", + "nuget:id": "Alzabox.API.SDK", + "nuget:version": "0.0.13" + } + ], + "@context": { + "@vocab": "http://schema.nuget.org/catalog#", + "nuget": "http://schema.nuget.org/schema#", + "items": { + "@id": "item", + "@container": "@set" + }, + "parent": { + "@type": "@id" + }, + "commitTimeStamp": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastCreated": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastEdited": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + }, + "nuget:lastDeleted": { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime" + } + } +} diff --git a/swh/lister/nuget/tests/test_lister.py b/swh/lister/nuget/tests/test_lister.py index 8c94c8e..a5488ff 100644 --- a/swh/lister/nuget/tests/test_lister.py +++ b/swh/lister/nuget/tests/test_lister.py @@ -1,34 +1,131 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.nuget.lister import NugetLister expected_origins = ["https://github.com/sillsdev/libpalaso.git"] +expected_origins_incremental = ["https://github.com/moq/Moq.AutoMocker"] def test_nuget_lister(datadir, requests_mock_datadir, swh_scheduler): lister = NugetLister(scheduler=swh_scheduler) res = lister.run() assert res.pages == 2 assert res.origins == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(expected_origins) assert [ ( scheduled.visit_type, scheduled.url, ) for scheduled in sorted(scheduler_origins, key=lambda 
scheduled: scheduled.url) ] == [ ( "git", url, ) for url in expected_origins ] + + +def test_nuget_lister_incremental(datadir, requests_mock_datadir_visits, swh_scheduler): + # First run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date is None + + res = lister.run() + assert res.pages == 2 + assert res.origins == 1 + assert lister.state.last_listing_date + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert [ + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "git", + url, + ) + for url in expected_origins + ] + + last_date = lister.state.last_listing_date + + # Second run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date == last_date + res = lister.run() + # One page and one new origin + assert lister.state.last_listing_date > last_date + assert res.pages == 1 + assert res.origins == 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert [ + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "git", + url, + ) + for url in sorted(expected_origins + expected_origins_incremental) + ] + + +def test_nuget_lister_incremental_no_changes( + datadir, requests_mock_datadir, swh_scheduler +): + # First run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date is None + + res = lister.run() + assert res.pages == 2 + assert res.origins == 1 + assert lister.state.last_listing_date + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(expected_origins) + + assert [ + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "git", + url, + ) + for url in expected_origins + ] + + last_date = lister.state.last_listing_date + + # Second run + lister = NugetLister(scheduler=swh_scheduler) + assert lister.state.last_listing_date == last_date + res = lister.run() + # Nothing new + assert lister.state.last_listing_date == last_date + assert res.pages == 0 + assert res.origins == 0 diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py index 724d198..6b54e66 100644 --- a/swh/lister/opam/lister.py +++ b/swh/lister/opam/lister.py @@ -1,144 +1,150 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import logging import os from subprocess import PIPE, Popen, call from typing import Any, Dict, Iterator, Optional from swh.lister.pattern import StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType logger = logging.getLogger(__name__) PageType = str class OpamLister(StatelessLister[PageType]): """ List all repositories hosted on an opam repository. On initialisation, we create an opam root, with no ocaml compiler (no switch) as we won't need it and it's costly. In this opam root, we add a single opam repository (url) and give it a name (instance). Then, to get pages, we just ask opam to list all the packages for our opam repository in our opam root. 
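A minimal usage sketch of this flow, assuming a pre-configured scheduler client (the `scheduler` variable below is hypothetical):

```
from swh.lister.opam.lister import OpamLister

lister = OpamLister(
    scheduler=scheduler,           # hypothetical, already-instantiated SchedulerInterface
    url="https://opam.ocaml.org",
    instance="opam",
    opam_root="/tmp/opam/",
)
stats = lister.run()               # one page is yielded per opam package name
```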
Args: url: base URL of an opam repository (for instance https://opam.ocaml.org) instance: string identifier for the listed repository """ # Part of the lister API, that identifies this lister LISTER_NAME = "opam" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, opam_root: str = "/tmp/opam/", ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.env = os.environ.copy() # Opam root folder is initialized in the :meth:`get_pages` method as no # side-effect should happen in the constructor to ease instantiation self.opam_root = opam_root def get_pages(self) -> Iterator[PageType]: # Initialize the opam root directory opam_init(self.opam_root, self.instance, self.url, self.env) # Actually list opam instance data proc = Popen( [ "opam", "list", "--all", "--no-switch", "--safe", "--repos", self.instance, "--root", self.opam_root, "--normalise", "--short", ], env=self.env, stdout=PIPE, ) if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): yield line.rstrip("\n") def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: """Convert a page of OpamLister repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None # a page is just a package name url = f"opam+{self.url}/packages/{page}/" yield ListedOrigin( lister_id=self.lister_obj.id, visit_type="opam", url=url, last_update=None, extra_loader_arguments={ "opam_root": self.opam_root, "opam_instance": self.instance, "opam_url": self.url, "opam_package": page, }, ) def opam_init(opam_root: str, instance: str, url: str, env: Dict[str, Any]) -> None: """Initialize an opam_root folder. Args: opam_root: The opam root folder to initialize instance: Name of the opam repository to add or initialize url: The associated url of the opam repository to add or initialize env: The global environment to use for the opam command. Returns: None. """ if not os.path.exists(opam_root) or not os.listdir(opam_root): command = [ "opam", "init", "--reinit", "--bare", "--no-setup", "--root", opam_root, instance, url, ] else: # The repository exists and is populated, we just add another instance in the # repository. 
If it's already setup, it's a noop command = [ "opam", "repository", "add", "--root", opam_root, instance, url, ] # Actually execute the command call(command, env=env) diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index e9fa296..af57b55 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -1,176 +1,182 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime, timezone import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 import requests from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) PackagistPageType = List[str] @dataclass class PackagistListerState: """State of Packagist lister""" last_listing_date: Optional[datetime] = None """Last date when packagist lister was executed""" class PackagistLister(Lister[PackagistListerState, PackagistPageType]): """ List all Packagist projects and send associated origins to scheduler. The lister queries the Packagist API, whose documentation can be found at https://packagist.org/apidoc. For each package, its metadata are retrieved using Packagist API endpoints whose responses are served from static files, which are guaranteed to be efficient on the Packagist side (no dymamic queries). Furthermore, subsequent listing will send the "If-Modified-Since" HTTP header to only retrieve packages metadata updated since the previous listing operation in order to save bandwidth and return only origins which might have new released versions. """ LISTER_NAME = "Packagist" PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.PACKAGIST_PACKAGES_LIST_URL, instance="packagist", credentials=credentials, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) self.listing_date = datetime.now().astimezone(tz=timezone.utc) def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: last_listing_date = d.get("last_listing_date") if last_listing_date is not None: d["last_listing_date"] = iso8601.parse_date(last_listing_date) return PackagistListerState(**d) def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"last_listing_date": None} last_listing_date = state.last_listing_date if last_listing_date is not None: d["last_listing_date"] = last_listing_date.isoformat() return d def api_request(self, url: str) -> Any: response = self.http_request(url) # response is empty when status code is 304 return response.json() if response.status_code == 200 else {} def get_pages(self) -> Iterator[PackagistPageType]: """ Yield a single page listing all Packagist projects. 
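The "If-Modified-Since" mechanism described in the class docstring boils down to formatting the previous listing date as an HTTP date; a short sketch with a hypothetical previous date:

```
from datetime import datetime, timezone

last_listing_date = datetime(2022, 10, 1, tzinfo=timezone.utc)  # hypothetical value
if_modified_since = last_listing_date.strftime("%a, %d %b %Y %H:%M:%S GMT")
# "Sat, 01 Oct 2022 00:00:00 GMT"; a 304 Not Modified reply is then mapped to an
# empty metadata dict, so unchanged packages are skipped during the next listing
```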
""" yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"] def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: """ Iterate on all Packagist projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None # save some bandwidth by only getting packages metadata updated since # last listing if self.state.last_listing_date is not None: if_modified_since = self.state.last_listing_date.strftime( "%a, %d %b %Y %H:%M:%S GMT" ) self.session.headers["If-Modified-Since"] = if_modified_since # to ensure origins will not be listed multiple times origin_urls = set() for package_name in page: try: metadata = self.api_request( f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json" ) if not metadata.get("packages", {}): # package metadata not updated since last listing continue if package_name not in metadata["packages"]: # missing package metadata in response continue versions_info = metadata["packages"][package_name].values() except requests.HTTPError: # error when getting package metadata (usually 404 when a # package has been removed), skip it and process next package continue origin_url = None visit_type = None last_update = None # extract origin url for package, vcs type and latest release date for version_info in versions_info: origin_url = version_info.get("source", {}).get("url", "") if not origin_url: continue # can be git, hg or svn visit_type = version_info.get("source", {}).get("type", "") dist_time_str = version_info.get("time", "") if not dist_time_str: continue dist_time = iso8601.parse_date(dist_time_str) if last_update is None or dist_time > last_update: last_update = dist_time # skip package with already seen origin url or with missing required info if visit_type is None or origin_url is None or origin_url in origin_urls: continue if visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical # ones assert self.github_session is not None origin_url = ( self.github_session.get_canonical_url(origin_url) or origin_url ) # bitbucket closed its mercurial hosting service, those origins can not be # loaded into the archive anymore if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"): continue origin_urls.add(origin_url) logger.debug( "Found package %s last updated on %s", package_name, last_update ) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=visit_type, last_update=last_update, ) def finalize(self) -> None: self.state.last_listing_date = self.listing_date self.updated = True diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 8a1b497..621b643 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -1,339 +1,368 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations from dataclasses import dataclass import logging from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, Set, TypeVar from urllib.parse import urlparse +import attr import requests from tenacity.before_sleep import before_sleep_log from swh.core.config import load_from_envvar from swh.core.github.utils import GitHubSession from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface from . 
import USER_AGENT_TEMPLATE from .utils import http_retry, is_valid_origin_url logger = logging.getLogger(__name__) @dataclass class ListerStats: pages: int = 0 origins: int = 0 def __add__(self, other: ListerStats) -> ListerStats: return self.__class__(self.pages + other.pages, self.origins + other.origins) def __iadd__(self, other: ListerStats): self.pages += other.pages self.origins += other.origins def dict(self) -> Dict[str, int]: return {"pages": self.pages, "origins": self.origins} StateType = TypeVar("StateType") PageType = TypeVar("PageType") BackendStateType = Dict[str, Any] CredentialsType = Optional[Dict[str, Dict[str, List[Dict[str, str]]]]] class Lister(Generic[StateType, PageType]): """The base class for a Software Heritage lister. A lister scrapes a page by page list of origins from an upstream (a forge, the API of a package manager, ...), and massages the results of that scrape into a list of origins that are recorded by the scheduler backend. The main loop of the lister, :meth:`run`, basically revolves around the :meth:`get_pages` iterator, which sets up the lister state, then yields the scrape results page by page. The :meth:`get_origins_from_page` method converts the pages into a list of :class:`model.ListedOrigin`, sent to the scheduler at every page. The :meth:`commit_page` method can be used to update the lister state after a page of origins has been recorded in the scheduler backend. The :func:`finalize` method is called at lister teardown (whether the run has been successful or not) to update the local :attr:`state` object before it's sent to the database. This method must set the :attr:`updated` attribute if an updated state needs to be sent to the scheduler backend. This method can call :func:`get_state_from_scheduler` to refresh and merge the lister state from the scheduler before it's finalized (and potentially minimize the risk of race conditions between concurrent runs of the lister). The state of the lister is serialized and deserialized from the dict stored in the scheduler backend, using the :meth:`state_from_dict` and :meth:`state_to_dict` methods. Args: scheduler: the instance of the Scheduler being used to register the origins listed by this lister url: a URL representing this lister, e.g. the API's base URL instance: the instance name, to uniquely identify this lister instance, if not provided the URL network location will be used credentials: dictionary of credentials for all listers. The first level identifies the :attr:`LISTER_NAME`, the second level the lister :attr:`instance`. The final level is a list of dicts containing the expected credentials for the given instance of that lister. 
+ max_pages: the maximum number of pages listed in a full listing operation + max_origins_per_page: the maximum number of origins processed per page + enable_origins: whether the created origins should be enabled or not Generic types: - *StateType*: concrete lister type; should usually be a :class:`dataclass` for stricter typing - *PageType*: type of scrape results; can usually be a :class:`requests.Response`, or a :class:`dict` """ LISTER_NAME: str = "" github_session: Optional[GitHubSession] = None def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, with_github_session: bool = False, ): if not self.LISTER_NAME: raise ValueError("Must set the LISTER_NAME attribute on Lister classes") self.url = url if instance is not None: self.instance = instance else: self.instance = urlparse(url).netloc self.scheduler = scheduler if not credentials: credentials = {} self.credentials = list( credentials.get(self.LISTER_NAME, {}).get(self.instance, []) ) # store the initial state of the lister self.state = self.get_state_from_scheduler() self.updated = False self.session = requests.Session() # Declare the USER_AGENT is more sysadm-friendly for the forge we list self.session.headers.update( {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME} ) self.github_session: Optional[GitHubSession] = ( GitHubSession( credentials=credentials.get("github", {}).get("github", []), user_agent=str(self.session.headers["User-Agent"]), ) if with_github_session else None ) self.recorded_origins: Set[str] = set() + self.max_pages = max_pages + self.max_origins_per_page = max_origins_per_page + self.enable_origins = enable_origins @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def http_request(self, url: str, method="GET", **kwargs) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, kwargs.get("params")) response = self.session.request(method, url, **kwargs) if response.status_code not in (200, 304): logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def run(self) -> ListerStats: """Run the lister. Returns: A counter with the number of pages and origins seen for this run of the lister. 
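As an illustration of this loop, a minimal stateless lister can be written as sketched below; `ExampleLister`, the example.org URLs and the `scheduler` variable are hypothetical and only show how `get_pages` and `get_origins_from_page` plug into `run`:

```
from typing import Iterator, List

from swh.lister.pattern import StatelessLister
from swh.scheduler.model import ListedOrigin


class ExampleLister(StatelessLister[List[str]]):
    """Hypothetical lister for a fictional example.org forge."""

    LISTER_NAME = "example"

    def get_pages(self) -> Iterator[List[str]]:
        # a single page holding two project names
        yield ["project-a", "project-b"]

    def get_origins_from_page(self, page: List[str]) -> Iterator[ListedOrigin]:
        assert self.lister_obj.id is not None
        for name in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=f"https://example.org/{name}",
                visit_type="git",
                last_update=None,
            )


# The new keyword arguments cap the amount of work done in a single run.
lister = ExampleLister(
    scheduler=scheduler,         # hypothetical, already-instantiated SchedulerInterface
    url="https://example.org",
    max_origins_per_page=1,      # keep only the first origin of each page
    max_pages=1,                 # stop after the first page
    enable_origins=False,        # record the listed origins as disabled
)
stats = lister.run()             # ListerStats(pages=1, origins=1)
```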
""" full_stats = ListerStats() self.recorded_origins = set() try: for page in self.get_pages(): full_stats.pages += 1 - origins = self.get_origins_from_page(page) + origins = list(self.get_origins_from_page(page)) + if ( + self.max_origins_per_page + and len(origins) > self.max_origins_per_page + ): + logger.info( + "Max origins per page set, truncated %s page results down to %s", + len(origins), + self.max_origins_per_page, + ) + origins = origins[: self.max_origins_per_page] + if not self.enable_origins: + logger.info( + "Disabling origins before sending them to the scheduler" + ) + origins = [attr.evolve(origin, enabled=False) for origin in origins] sent_origins = self.send_origins(origins) self.recorded_origins.update(sent_origins) full_stats.origins = len(self.recorded_origins) self.commit_page(page) + + if self.max_pages and full_stats.pages >= self.max_pages: + logger.info("Reached page limit of %s, terminating", self.max_pages) + break finally: self.finalize() if self.updated: self.set_state_in_scheduler() return full_stats def get_state_from_scheduler(self) -> StateType: """Update the state in the current instance from the state in the scheduler backend. This updates :attr:`lister_obj`, and returns its (deserialized) current state, to allow for comparison with the local state. Returns: the state retrieved from the scheduler backend """ self.lister_obj = self.scheduler.get_or_create_lister( name=self.LISTER_NAME, instance_name=self.instance ) return self.state_from_dict(self.lister_obj.current_state) def set_state_in_scheduler(self) -> None: """Update the state in the scheduler backend from the state of the current instance. Raises: swh.scheduler.exc.StaleData: in case of a race condition between concurrent listers (from :meth:`swh.scheduler.Scheduler.update_lister`). """ self.lister_obj.current_state = self.state_to_dict(self.state) self.lister_obj = self.scheduler.update_lister(self.lister_obj) # State management to/from the scheduler def state_from_dict(self, d: BackendStateType) -> StateType: """Convert the state stored in the scheduler backend (as a dict), to the concrete StateType for this lister.""" raise NotImplementedError def state_to_dict(self, state: StateType) -> BackendStateType: """Convert the StateType for this lister to its serialization as dict for storage in the scheduler. Values must be JSON-compatible as that's what the backend database expects. """ raise NotImplementedError def finalize(self) -> None: """Custom hook to finalize the lister state before returning from the main loop. This method must set :attr:`updated` if the lister has done some work. If relevant, this method can use :meth`get_state_from_scheduler` to merge the current lister state with the one from the scheduler backend, reducing the risk of race conditions if we're running concurrent listings. This method is called in a `finally` block, which means it will also run when the lister fails. """ pass # Actual listing logic def get_pages(self) -> Iterator[PageType]: """Retrieve a list of pages of listed results. This is the main loop of the lister. Returns: an iterator of raw pages fetched from the platform currently being listed. """ raise NotImplementedError def get_origins_from_page(self, page: PageType) -> Iterator[model.ListedOrigin]: """Extract a list of :class:`model.ListedOrigin` from a raw page of results. 
Args: page: a single page of results Returns: an iterator for the origins present on the given page of results """ raise NotImplementedError def commit_page(self, page: PageType) -> None: """Custom hook called after the current page has been committed in the scheduler backend. This method can be used to update the state after a page of origins has been successfully recorded in the scheduler backend. If the new state should be recorded at the point the lister completes, the :attr:`updated` attribute must be set. """ pass def send_origins(self, origins: Iterable[model.ListedOrigin]) -> List[str]: """Record a list of :class:`model.ListedOrigin` in the scheduler. Returns: the list of origin URLs recorded in scheduler database """ valid_origins = [] for origin in origins: if is_valid_origin_url(origin.url): valid_origins.append(origin) else: logger.warning("Skipping invalid origin: %s", origin.url) recorded_origins = [] for batch_origins in grouper(valid_origins, n=1000): ret = self.scheduler.record_listed_origins(batch_origins) recorded_origins += [origin.url for origin in ret] return recorded_origins @classmethod def from_config(cls, scheduler: Dict[str, Any], **config: Any): """Instantiate a lister from a configuration dict. This is basically a backwards-compatibility shim for the CLI. Args: scheduler: instantiation config for the scheduler config: the configuration dict for the lister, with the following keys: - credentials (optional): credentials list for the scheduler - any other kwargs passed to the lister. Returns: the instantiated lister """ # Drop the legacy config keys which aren't used for this generation of listers. for legacy_key in ("storage", "lister", "celery"): config.pop(legacy_key, None) # Instantiate the scheduler scheduler_instance = get_scheduler(**scheduler) return cls(scheduler=scheduler_instance, **config) @classmethod def from_configfile(cls, **kwargs: Any): """Instantiate a lister from the configuration loaded from the SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their value is not None. Args: kwargs: kwargs passed to the lister instantiation """ config = dict(load_from_envvar()) config.update({k: v for k, v in kwargs.items() if v is not None}) return cls.from_config(**config) class StatelessLister(Lister[None, PageType], Generic[PageType]): def state_from_dict(self, d: BackendStateType) -> None: """Always return empty state""" return None def state_to_dict(self, state: None) -> BackendStateType: """Always set empty state""" return {} diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index 4556178..651dc8e 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -1,165 +1,174 @@ # Copyright (C) 2019-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import logging import random from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) PageType = List[Dict[str, Any]] class PhabricatorLister(StatelessLister[PageType]): """ List all repositories hosted on a Phabricator instance. 
Args: url: base URL of a phabricator forge (for instance https://forge.softwareheritage.org) instance: string identifier for the listed forge, URL network location will be used if not provided api_token: authentication token for Conduit API """ LISTER_NAME = "phabricator" API_REPOSITORY_PATH = "/api/diffusion.repository.search" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, api_token: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials + scheduler=scheduler, + url=urljoin(url, self.API_REPOSITORY_PATH), + instance=instance, + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) if api_token is not None: self.api_token = api_token else: if not self.credentials: raise ValueError( f"No credentials found for phabricator instance {self.instance};" " Please set them in the lister configuration file." ) self.api_token = random.choice(self.credentials)["password"] def get_request_params(self, after: Optional[str]) -> Dict[str, str]: """Get the query parameters for the request.""" base_params = { # Stable order "order": "oldest", # Add all URIs to the response "attachments[uris]": "1", # API token from stored credentials "api.token": self.api_token, } if after is not None: base_params["after"] = after return base_params @staticmethod def filter_params(params: Dict[str, str]) -> Dict[str, str]: """Filter the parameters for debug purposes""" return { k: (v if k != "api.token" else "**redacted**") for k, v in params.items() } def get_pages(self) -> Iterator[PageType]: after: Optional[str] = None while True: params = self.get_request_params(after) response = self.http_request(self.url, method="POST", data=params) response_data = response.json() if response_data.get("result") is None: logger.warning( "Got unexpected response on %s: %s", response.url, response_data, ) break result = response_data["result"] yield result["data"] after = None if "cursor" in result and "after" in result["cursor"]: after = result["cursor"]["after"] if not after: logger.debug("Empty `after` cursor. All done") break def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for repo in page: url = get_repo_url(repo["attachments"]["uris"]["uris"]) if url is None: short_name: Optional[str] = None for field in "shortName", "name", "callsign": short_name = repo["fields"].get(field) if short_name: break logger.warning( "No valid url for repository [%s] (phid=%s)", short_name or repo["phid"], repo["phid"], ) continue yield ListedOrigin( lister_id=self.lister_obj.id, url=url, visit_type=repo["fields"]["vcs"], # The "dateUpdated" field returned by the Phabricator API only refers to # the repository metadata; We can't use it for our purposes. 
last_update=None, ) def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[str]: """ Return url for a hosted repository from its uris attachments according to the following priority lists: * protocol: https > http * identifier: shortname > callsign > id """ processed_urls = defaultdict(dict) # type: Dict[str, Any] for uri in attachments: protocol = uri["fields"]["builtin"]["protocol"] url = uri["fields"]["uri"]["effective"] identifier = uri["fields"]["builtin"]["identifier"] if protocol in ("http", "https"): processed_urls[protocol][identifier] = url elif protocol is None: for protocol in ("https", "http"): if url.startswith(protocol): processed_urls[protocol]["undefined"] = url break for protocol in ["https", "http"]: for identifier in ["shortname", "callsign", "id", "undefined"]: if protocol in processed_urls and identifier in processed_urls[protocol]: return processed_urls[protocol][identifier] return None diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index fd1dc45..50e4f15 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,94 +1,100 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Iterator, List, Optional import iso8601 from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. PubDevListerPage = List[str] class PubDevLister(StatelessLister[PubDevListerPage]): """List pub.dev (Dart, Flutter) origins.""" LISTER_NAME = "pubdev" VISIT_TYPE = "pubdev" INSTANCE = "pubdev" BASE_URL = "https://pub.dev/" PACKAGE_NAMES_URL_PATTERN = "{base_url}api/package-names" PACKAGE_INFO_URL_PATTERN = "{base_url}api/packages/{pkgname}" ORIGIN_URL_PATTERN = "{base_url}packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package origins. The http api call get "{base_url}package-names" to retrieve a sorted list of all package names. 
There is only one page that list all origins url based on "{base_url}packages/{pkgname}" """ response = self.http_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url) ) yield response.json()["packages"] def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for pkgname in page: package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) try: response = self.http_request(url=package_info_url) except HTTPError: logger.warning( "Failed to fetch metadata for package %s, skipping it from listing.", pkgname, ) continue package_metadata = response.json() package_versions = package_metadata["versions"] last_published = max( package_version["published"] for package_version in package_versions ) origin_url = self.ORIGIN_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin_url, last_update=iso8601.parse_date(last_published), ) diff --git a/swh/lister/puppet/__init__.py b/swh/lister/puppet/__init__.py index 3e5e28d..5d8e6a3 100644 --- a/swh/lister/puppet/__init__.py +++ b/swh/lister/puppet/__init__.py @@ -1,108 +1,112 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ Puppet lister ============= The Puppet lister list origins from `Puppet Forge`_. Puppet Forge is a package manager for Puppet modules. As of September 2022 `Puppet Forge`_ list 6917 package names. Origins retrieving strategy --------------------------- To get a list of all package names we call an `http api endpoint`_ which have a `getModules`_ endpoint. It returns a paginated list of results and a `next` url. The api follow `OpenApi 3.0 specifications`. +The lister is incremental using ``with_release_since`` api argument whose value is an +iso date set regarding the last time the lister has been executed, stored as +``lister.state.last_listing_date``. + Page listing ------------ Each page returns a list of ``results`` which are raw data from api response. The results size is 100 as 100 is the maximum limit size allowed by the api. Origins from page ----------------- The lister yields one hundred origin url per page. Origin url is the html page corresponding to a package name on the forge, following this pattern:: "https://forge.puppet.com/modules/{owner}/{pkgname}" For each origin `last_update` is set via the module "updated_at" value. As the api also returns all existing versions for a package, we build an `artifacts` dict in `extra_loader_arguments` with the archive tarball corresponding to each existing versions. 
Example for ``file_concat`` module located at https://forge.puppet.com/modules/electrical/file_concat:: { "artifacts": [ { "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.1.tar.gz", # noqa: B950 "version": "1.0.1", "filename": "electrical-file_concat-1.0.1.tar.gz", "last_update": "2015-04-17T01:03:46-07:00", "checksums": { "md5": "74901a89544134478c2dfde5efbb7f14", "sha256": "15e973613ea038d8a4f60bafe2d678f88f53f3624c02df3157c0043f4a400de6", # noqa: B950 }, }, { "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.0.tar.gz", # noqa: B950 "version": "1.0.0", "filename": "electrical-file_concat-1.0.0.tar.gz", "last_update": "2015-04-09T12:03:13-07:00", "checksums": { "length": 13289, }, }, ], } Running tests ------------- Activate the virtualenv and run from within swh-lister directory:: pytest -s -vv --log-cli-level=DEBUG swh/lister/puppet/tests Testing with Docker ------------------- Change directory to swh/docker then launch the docker environment:: docker compose up -d Then schedule a Puppet listing task:: docker compose exec swh-scheduler swh scheduler task add -p oneshot list-puppet You can follow lister execution by displaying logs of swh-lister service:: docker compose logs -f swh-lister .. _Puppet Forge: https://forge.puppet.com/ .. _http api endpoint: https://forgeapi.puppet.com/ .. _getModules: https://forgeapi.puppet.com/#tag/Module-Operations/operation/getModules """ def register(): from .lister import PuppetLister return { "lister": PuppetLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py index 80ac3da..6e84b27 100644 --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -1,113 +1,161 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone import logging from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin +import iso8601 + from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
PuppetListerPage = List[Dict[str, Any]] -class PuppetLister(StatelessLister[PuppetListerPage]): +@dataclass +class PuppetListerState: + """Store lister state for incremental mode operations""" + + last_listing_date: Optional[datetime] = None + """Last date when Puppet lister was executed""" + + +class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): """The Puppet lister list origins from 'Puppet Forge'""" LISTER_NAME = "puppet" VISIT_TYPE = "puppet" INSTANCE = "puppet" BASE_URL = "https://forgeapi.puppet.com/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) + # Store the datetime the lister runs for incremental purpose + self.listing_date = datetime.now() + + def state_from_dict(self, d: Dict[str, Any]) -> PuppetListerState: + last_listing_date = d.get("last_listing_date") + if last_listing_date is not None: + d["last_listing_date"] = iso8601.parse_date(last_listing_date) + return PuppetListerState(**d) + + def state_to_dict(self, state: PuppetListerState) -> Dict[str, Any]: + d: Dict[str, Optional[str]] = {"last_listing_date": None} + last_listing_date = state.last_listing_date + if last_listing_date is not None: + d["last_listing_date"] = last_listing_date.isoformat() + return d def get_pages(self) -> Iterator[PuppetListerPage]: """Yield an iterator which returns 'page' It request the http api endpoint to get a paginated results of modules, and retrieve a `next` url. It ends when `next` json value is `null`. 
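In incremental mode, the ``with_release_since`` filter described in the package documentation is derived from the ``last_listing_date`` kept in the lister state; a short sketch with a hypothetical previous run date:

```
from datetime import datetime, timedelta, timezone

last_listing_date = datetime(2022, 9, 26, 12, 0, tzinfo=timezone.utc)  # hypothetical
with_release_since = (
    last_listing_date.astimezone(timezone(timedelta(hours=-15))).date().isoformat()
)
# "2022-09-25": shifting by a generously negative UTC offset before taking the
# calendar date ensures no release published since the previous run is filtered out
params = {"limit": 100, "with_release_since": with_release_since}
```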
Open Api specification for getModules endpoint: https://forgeapi.puppet.com/#tag/Module-Operations/operation/getModules """ # limit = 100 is the max value for pagination limit: int = 100 - response = self.http_request( - f"{self.BASE_URL}v3/modules", params={"limit": limit} - ) + params: Dict[str, Any] = {"limit": limit} + + if self.state.last_listing_date: + # Incremental mode filter query + # To ensure we don't miss records between two lister runs `last_str`` must be + # set with an offset of -15 hours, which is the lower timezone recorded in the + # tzdb + last_str = ( + self.state.last_listing_date.astimezone(timezone(timedelta(hours=-15))) + .date() + .isoformat() + ) + params["with_release_since"] = last_str + + response = self.http_request(f"{self.BASE_URL}v3/modules", params=params) data: Dict[str, Any] = response.json() yield data["results"] while data["pagination"]["next"]: response = self.http_request( urljoin(self.BASE_URL, data["pagination"]["next"]) ) data = response.json() yield data["results"] def get_origins_from_page(self, page: PuppetListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None dt_parse_pattern = "%Y-%m-%d %H:%M:%S %z" for entry in page: last_update = datetime.strptime(entry["updated_at"], dt_parse_pattern) pkgname = entry["name"] owner = entry["owner"]["slug"] url = f"https://forge.puppet.com/modules/{owner}/{pkgname}" artifacts = [] for release in entry["releases"]: # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 checksums = {} if release["version"] == entry["current_release"]["version"]: # checksums are only available for current release for checksum in ("md5", "sha256"): checksums[checksum] = entry["current_release"][ f"file_{checksum}" ] else: # use file length as basic content check instead checksums["length"] = release["file_size"] artifacts.append( { "filename": release["file_uri"].split("/")[-1], "url": urljoin(self.BASE_URL, release["file_uri"]), "version": release["version"], "last_update": datetime.strptime( release["created_at"], dt_parse_pattern ).isoformat(), "checksums": checksums, } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, extra_loader_arguments={"artifacts": artifacts}, ) + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/puppet/tests/data/https_forgeapi.puppet.com/v3_modules,limit=100,with_release_since=2022-09-26 b/swh/lister/puppet/tests/data/https_forgeapi.puppet.com/v3_modules,limit=100,with_release_since=2022-09-26 new file mode 100644 index 0000000..8bab26c --- /dev/null +++ b/swh/lister/puppet/tests/data/https_forgeapi.puppet.com/v3_modules,limit=100,with_release_since=2022-09-26 @@ -0,0 +1,286 @@ +{ + "pagination": { + "limit": 100, + "offset": 0, + "first": "/v3/modules?limit=100&with_release_since=2022-09-26&offset=0", + "previous": null, + "current": "/v3/modules?limit=100&with_release_since=2022-09-26&offset=0", + "next": null, + "total": 1 + }, + "results": [ + { + "uri": "/v3/modules/puppet-nftables", + "slug": "puppet-nftables", + "name": "nftables", + "downloads": 8031, + "created_at": "2020-12-15 02:15:22 -0800", + "updated_at": "2022-10-25 10:43:50 -0700", + "deprecated_at": null, + "deprecated_for": null, + "superseded_by": null, + 
"supported": false, + "endorsement": null, + "module_group": "base", + "owner": { + "uri": "/v3/users/puppet", + "slug": "puppet", + "username": "puppet", + "gravatar_id": "e700f5900e0f795fc6811516b475345a" + }, + "premium": false, + "current_release": { + "uri": "/v3/releases/puppet-nftables-2.6.0", + "slug": "puppet-nftables-2.6.0", + "module": { + "uri": "/v3/modules/puppet-nftables", + "slug": "puppet-nftables", + "name": "nftables", + "deprecated_at": null, + "owner": { + "uri": "/v3/users/puppet", + "slug": "puppet", + "username": "puppet", + "gravatar_id": "e700f5900e0f795fc6811516b475345a" + } + }, + "version": "2.6.0", + "metadata": { + "name": "puppet-nftables", + "version": "2.6.0", + "author": "Vox Pupuli", + "summary": "Puppet nftables module", + "license": "Apache-2.0", + "tags": [ + "firewall", + "security", + "nftables", + "iptables" + ], + "source": "https://github.com/voxpupuli/puppet-nftables.git", + "project_page": "https://github.com/voxpupuli/puppet-nftables", + "issues_url": "https://github.com/voxpupuli/puppet-nftables/issues", + "dependencies": [ + { + "name": "puppetlabs/concat", + "version_requirement": ">= 6.2.0 < 8.0.0" + }, + { + "name": "puppet/systemd", + "version_requirement": ">= 2.0.0 < 4.0.0" + }, + { + "name": "puppetlabs/stdlib", + "version_requirement": ">= 4.13.1 < 9.0.0" + } + ], + "operatingsystem_support": [ + { + "operatingsystem": "CentOS", + "operatingsystemrelease": [ + "8", + "9" + ] + }, + { + "operatingsystem": "OracleLinux", + "operatingsystemrelease": [ + "8", + "9" + ] + }, + { + "operatingsystem": "RedHat", + "operatingsystemrelease": [ + "8", + "9" + ] + }, + { + "operatingsystem": "Archlinux" + }, + { + "operatingsystem": "Debian", + "operatingsystemrelease": [ + "11" + ] + }, + { + "operatingsystem": "Ubuntu", + "operatingsystemrelease": [ + "20.04", + "22.04" + ] + } + ], + "requirements": [ + { + "name": "puppet", + "version_requirement": ">= 6.1.0 < 8.0.0" + } + ] + }, + "tags": [ + "iptables", + "security", + "firewall", + "nftables" + ], + "supported": false, + "pdk": false, + "validation_score": 100, + "file_uri": "/v3/files/puppet-nftables-2.6.0.tar.gz", + "file_size": 36251, + "file_md5": "563dcb90a8e9ea91ff1968452824f3aa", + "file_sha256": "53bcbd308220cfbcef298a12f736656c594179d0e4035564c9564f4e721dfff6", + "downloads": 65, + "readme": "# nftables puppet module\n\n[![Puppet Forge](https://img.shields.io/puppetforge/v/puppet/nftables.svg)](https://forge.puppetlabs.com/puppet/nftables)\n[![Puppet Forge - downloads](https://img.shields.io/puppetforge/dt/puppet/nftables.svg)](https://forge.puppetlabs.com/puppet/nftables)\n[![puppetmodule.info docs](http://www.puppetmodule.info/images/badge.png)](http://www.puppetmodule.info/m/puppet-nftables)\n[![Apache-2.0 License](https://img.shields.io/github/license/voxpupuli/puppet-nftables.svg)](LICENSE)\n\nThis module manages an opinionated nftables configuration.\n\nBy default it sets up a firewall that drops every incoming\nand outgoing connection.\n\nIt only allows outgoing dns, ntp and web and ingoing ssh\ntraffic, although this can be overridden using parameters.\n\nThe config file has a inet filter and a ip nat table setup.\n\nAdditionally, the module comes with a basic infrastructure\nto hook into different places.\n\n## Configuration\n\nThe main configuration file loaded by the nftables service\nwill be `files/config/puppet.nft`, all other files created\nby that module go into `files/config/puppet` and will also\nbe purged if not managed anymore.\n\nThe main configuration file 
includes dedicated files for\nthe filter and nat tables, as well as processes any\n`custom-*.nft` files before hand.\n\nThe filter and NAT tables both have all the master chains\n(INPUT, OUTPUT, FORWARD in case of filter and PREROUTING\nand POSTROUTING in case of NAT) configured, to which you\ncan hook in your own chains that can contain specific\nrules.\n\nAll filter masterchains drop by default.\nBy default we have a set of default_MASTERCHAIN chains\nconfigured to which you can easily add your custom rules.\n\nFor specific needs you can add your own chain.\n\nThere is a global chain, that defines the default behavior\nfor all masterchains. This chain is empty by default.\n\nINPUT and OUTPUT to the loopback device is allowed by\ndefault, though you could restrict it later.\n\nOn the other hand, if you don't want any of the default tables, chains\nand rules created by the module, you can set `nftables::inet_filter`\nand/or `nftables::nat` to `false` and build your whole nftables\nconfiguration from scratch by using the building blocks provided by\nthis module. Looking at `nftables::inet_filter` for inspiration might\nbe a good idea.\n\n## Rules Validation\n\nInitially puppet deploys all configuration to\n`/etc/nftables/puppet-preflight/` and\n`/etc/nftables/puppet-preflight.nft`. This is validated with\n`nfc -c -L /etc/nftables/puppet-preflight/ -f /etc/nftables/puppet-preflight.nft`.\nIf and only if successful the configuration will be copied to\nthe real locations before the service is reloaded.\n\n## Basic types\n\n### nftables::config\n\nManages a raw file in `/etc/nftables/puppet/${name}.nft`\n\nUse this for any custom table files.\n\n### nftables::chain\n\nPrepares a chain file as a `concat` file to which you will\nbe able to add dedicated rules through `nftables::rule`.\n\nThe name must be unique for all chains. The inject\nparameter can be used to directly add a jump to a\nmasterchain. inject must follow the pattern\n`ORDER-MASTERCHAIN`, where order references a 2-digit\nnumber which defines the rule order (by default use e.g. 20)\nand masterchain references the chain to hook in the new\nchain. It's possible to specify the in-interface name and\nout-interface name for the inject rule.\n\n### nftables::rule\n\nA simple way to add rules to any chain. The name must be:\n`CHAIN_NAME-rulename`, where CHAIN_NAME refers to your\nchain and an arbitrary name for your rule.\nThe rule will be a `concat::fragment` to the chain\n`CHAIN_NAME`.\n\nYou can define the order by using the `order` param.\n\nBefore defining your own rule, take a look to the list of ready-to-use rules\navailable in the\n[REFERENCE](https://github.com/voxpupuli/puppet-nftables/blob/master/REFERENCE.md),\nsomebody might have encapsulated a rule definition for you already.\n\n### nftables::set\n\nAdds a named set to a given table. It allows composing the\nset using individual parameters but also takes raw input\nvia the content and source parameters.\n\n### nftables::simplerule\n\nAllows expressing firewall rules without having to use nftables's language by\nadding an abstraction layer a-la-Firewall. 
It's rather limited how far you can\ngo so if you need rather complex rules or you can speak nftables it's\nrecommended to use `nftables::rule` directly.\n\n## Facts\n\nOne structured fact `nftables` is available\n\n```\n{\n tables => [\n \"bridge-filter\",\n \"bridge-nat\",\n \"inet-firewalld\",\n \"ip-firewalld\",\n \"ip6-firewalld\"\n ],\n version => \"0.9.3\"\n}\n```\n\n* `nftables.version` is the version of the nft command from `nft --version`.\n* `nftables.tables` is the list of tables installed on the machine from `nft list tables`.\n\n## Editor goodies\n\nIf you're using Emacs there are some snippets for\n[Yasnippet](https://github.com/joaotavora/yasnippet) available\n[here](https://github.com/nbarrientos/dotfiles/tree/master/.emacs.d/snippets/puppet-mode)\nthat could make your life easier when using the module. This is third\nparty configuration that's only included here for reference so changes\nin the interfaces exposed by this module are not guaranteed to be\nautomatically applied there.\n", + "changelog": "# Changelog\n\nAll notable changes to this project will be documented in this file.\nEach new release typically also includes the latest modulesync defaults.\nThese should not affect the functionality of the module.\n\n## [v2.6.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.6.0) (2022-10-25)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.5.0...v2.6.0)\n\n**Implemented enhancements:**\n\n- Add class for outgoing HKP firewalling [\\#153](https://github.com/voxpupuli/puppet-nftables/pull/153) ([bastelfreak](https://github.com/bastelfreak))\n- Add Ubuntu support [\\#152](https://github.com/voxpupuli/puppet-nftables/pull/152) ([bastelfreak](https://github.com/bastelfreak))\n- split conntrack management into dedicated classes [\\#148](https://github.com/voxpupuli/puppet-nftables/pull/148) ([duritong](https://github.com/duritong))\n- New nftables::file type to include raw file [\\#147](https://github.com/voxpupuli/puppet-nftables/pull/147) ([traylenator](https://github.com/traylenator))\n\n**Closed issues:**\n\n- Add ability to include completely raw files [\\#146](https://github.com/voxpupuli/puppet-nftables/issues/146)\n- Add support for Debian [\\#65](https://github.com/voxpupuli/puppet-nftables/issues/65)\n\n## [v2.5.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.5.0) (2022-08-26)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.4.0...v2.5.0)\n\n**Implemented enhancements:**\n\n- Add all nftables families as a valid noflush pattern [\\#142](https://github.com/voxpupuli/puppet-nftables/pull/142) ([luisfdez](https://github.com/luisfdez))\n\n**Fixed bugs:**\n\n- Properly escape bridge in rulename [\\#144](https://github.com/voxpupuli/puppet-nftables/pull/144) ([duritong](https://github.com/duritong))\n\n**Closed issues:**\n\n- nftables::bridges creates invalid rule names when bridge devices have multiple IP addresses [\\#143](https://github.com/voxpupuli/puppet-nftables/issues/143)\n\n## [v2.4.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.4.0) (2022-07-11)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.3.0...v2.4.0)\n\n**Implemented enhancements:**\n\n- Add rule to allow outgoing whois queries [\\#140](https://github.com/voxpupuli/puppet-nftables/pull/140) ([bastelfreak](https://github.com/bastelfreak))\n- chrony: Allow filtering for outgoing NTP servers [\\#139](https://github.com/voxpupuli/puppet-nftables/pull/139) ([bastelfreak](https://github.com/bastelfreak))\n- 
Add class for pxp-agent firewalling [\\#138](https://github.com/voxpupuli/puppet-nftables/pull/138) ([bastelfreak](https://github.com/bastelfreak))\n\n## [v2.3.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.3.0) (2022-07-06)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.2.1...v2.3.0)\n\n**Implemented enhancements:**\n\n- systemctl: Use relative path [\\#136](https://github.com/voxpupuli/puppet-nftables/pull/136) ([bastelfreak](https://github.com/bastelfreak))\n- Add Debian support [\\#134](https://github.com/voxpupuli/puppet-nftables/pull/134) ([bastelfreak](https://github.com/bastelfreak))\n- make path to echo configureable [\\#133](https://github.com/voxpupuli/puppet-nftables/pull/133) ([bastelfreak](https://github.com/bastelfreak))\n- make path to `nft` binary configureable [\\#132](https://github.com/voxpupuli/puppet-nftables/pull/132) ([bastelfreak](https://github.com/bastelfreak))\n\n## [v2.2.1](https://github.com/voxpupuli/puppet-nftables/tree/v2.2.1) (2022-05-02)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.2.0...v2.2.1)\n\n**Merged pull requests:**\n\n- rspec mock systemd process on docker [\\#128](https://github.com/voxpupuli/puppet-nftables/pull/128) ([traylenator](https://github.com/traylenator))\n\n## [v2.2.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.2.0) (2022-02-27)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.1.0...v2.2.0)\n\n**Implemented enhancements:**\n\n- Add support for Arch Linux [\\#124](https://github.com/voxpupuli/puppet-nftables/pull/124) ([hashworks](https://github.com/hashworks))\n- Declare support for RHEL9, CentOS9 and OL9 [\\#120](https://github.com/voxpupuli/puppet-nftables/pull/120) ([nbarrientos](https://github.com/nbarrientos))\n- Rubocop corrections for rubocop 1.22.3 [\\#118](https://github.com/voxpupuli/puppet-nftables/pull/118) ([traylenator](https://github.com/traylenator))\n- Use protocol number instead of label [\\#112](https://github.com/voxpupuli/puppet-nftables/pull/112) ([keachi](https://github.com/keachi))\n\n**Fixed bugs:**\n\n- Ensure that nftables.service remains active after it exits [\\#125](https://github.com/voxpupuli/puppet-nftables/pull/125) ([hashworks](https://github.com/hashworks))\n\n**Merged pull requests:**\n\n- Fix typos in initial reference examples [\\#122](https://github.com/voxpupuli/puppet-nftables/pull/122) ([hashworks](https://github.com/hashworks))\n\n## [v2.1.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.1.0) (2021-09-14)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v2.0.0...v2.1.0)\n\n**Implemented enhancements:**\n\n- nftables::set can only be assigned to 1 table [\\#100](https://github.com/voxpupuli/puppet-nftables/issues/100)\n- support a different table name for 'nat' [\\#107](https://github.com/voxpupuli/puppet-nftables/pull/107) ([figless](https://github.com/figless))\n- Allow declaring the same set in several tables [\\#102](https://github.com/voxpupuli/puppet-nftables/pull/102) ([nbarrientos](https://github.com/nbarrientos))\n\n**Fixed bugs:**\n\n- fix datatype for $table and $dport [\\#104](https://github.com/voxpupuli/puppet-nftables/pull/104) ([bastelfreak](https://github.com/bastelfreak))\n\n**Merged pull requests:**\n\n- Allow stdlib 8.0.0 [\\#106](https://github.com/voxpupuli/puppet-nftables/pull/106) ([smortex](https://github.com/smortex))\n- switch from camptocamp/systemd to voxpupuli/systemd 
[\\#103](https://github.com/voxpupuli/puppet-nftables/pull/103) ([bastelfreak](https://github.com/bastelfreak))\n- pull fixtures from git and not forge [\\#99](https://github.com/voxpupuli/puppet-nftables/pull/99) ([bastelfreak](https://github.com/bastelfreak))\n\n## [v2.0.0](https://github.com/voxpupuli/puppet-nftables/tree/v2.0.0) (2021-06-03)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v1.3.0...v2.0.0)\n\n**Breaking changes:**\n\n- Drop Puppet 5, puppetlabs/concat 7.x, puppetlabs/stdlib 7.x, camptocamp/systemd: 3.x [\\#92](https://github.com/voxpupuli/puppet-nftables/pull/92) ([traylenator](https://github.com/traylenator))\n- Drop Puppet 5 support [\\#79](https://github.com/voxpupuli/puppet-nftables/pull/79) ([kenyon](https://github.com/kenyon))\n\n**Implemented enhancements:**\n\n- Ability to set base chains [\\#95](https://github.com/voxpupuli/puppet-nftables/issues/95)\n- puppetlabs/concat: Allow 7.x [\\#91](https://github.com/voxpupuli/puppet-nftables/pull/91) ([bastelfreak](https://github.com/bastelfreak))\n- puppetlabs/stdlib: Allow 7.x [\\#90](https://github.com/voxpupuli/puppet-nftables/pull/90) ([bastelfreak](https://github.com/bastelfreak))\n- camptocamp/systemd: allow 3.x [\\#89](https://github.com/voxpupuli/puppet-nftables/pull/89) ([bastelfreak](https://github.com/bastelfreak))\n\n**Fixed bugs:**\n\n- Fix IPv4 source address type detection [\\#93](https://github.com/voxpupuli/puppet-nftables/pull/93) ([nbarrientos](https://github.com/nbarrientos))\n\n**Closed issues:**\n\n- Class\\[Nftables::Bridges\\]\\['bridgenames'\\] contains a Regexp value. It will be converted to the String '/^br.+/' [\\#83](https://github.com/voxpupuli/puppet-nftables/issues/83)\n\n**Merged pull requests:**\n\n- Allow creating a totally empty firewall [\\#96](https://github.com/voxpupuli/puppet-nftables/pull/96) ([nbarrientos](https://github.com/nbarrientos))\n- Amend link to Yasnippets [\\#88](https://github.com/voxpupuli/puppet-nftables/pull/88) ([nbarrientos](https://github.com/nbarrientos))\n\n## [v1.3.0](https://github.com/voxpupuli/puppet-nftables/tree/v1.3.0) (2021-03-25)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v1.2.0...v1.3.0)\n\n**Implemented enhancements:**\n\n- Add rules for QEMU/libvirt guests \\(bridged virtual networking\\) [\\#85](https://github.com/voxpupuli/puppet-nftables/pull/85) ([nbarrientos](https://github.com/nbarrientos))\n- Add nftables.version to structured fact. 
[\\#84](https://github.com/voxpupuli/puppet-nftables/pull/84) ([traylenator](https://github.com/traylenator))\n- Add rules for Apache ActiveMQ [\\#82](https://github.com/voxpupuli/puppet-nftables/pull/82) ([nbarrientos](https://github.com/nbarrientos))\n- Add Docker-CE default rules [\\#80](https://github.com/voxpupuli/puppet-nftables/pull/80) ([luisfdez](https://github.com/luisfdez))\n\n**Closed issues:**\n\n- Increase puppetlabs/concat version in metadata [\\#78](https://github.com/voxpupuli/puppet-nftables/issues/78)\n\n**Merged pull requests:**\n\n- Fix sections and add a pointer to code snippets for Emacs [\\#81](https://github.com/voxpupuli/puppet-nftables/pull/81) ([nbarrientos](https://github.com/nbarrientos))\n\n## [v1.2.0](https://github.com/voxpupuli/puppet-nftables/tree/v1.2.0) (2021-03-03)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v1.1.1...v1.2.0)\n\n**Implemented enhancements:**\n\n- start declaring the 'global' chain with module resources [\\#73](https://github.com/voxpupuli/puppet-nftables/pull/73) ([lelutin](https://github.com/lelutin))\n\n**Fixed bugs:**\n\n- nftables service is broken after reboot [\\#74](https://github.com/voxpupuli/puppet-nftables/issues/74)\n- fix \\#74 - ensure table are initialized before flushing them [\\#75](https://github.com/voxpupuli/puppet-nftables/pull/75) ([duritong](https://github.com/duritong))\n\n## [v1.1.1](https://github.com/voxpupuli/puppet-nftables/tree/v1.1.1) (2021-01-29)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v1.1.0...v1.1.1)\n\n**Fixed bugs:**\n\n- Simplerule: wrong IP protocol version filter statement for IPv6 traffic [\\#69](https://github.com/voxpupuli/puppet-nftables/issues/69)\n- Fix IP version filter for IPv6 traffic [\\#70](https://github.com/voxpupuli/puppet-nftables/pull/70) ([nbarrientos](https://github.com/nbarrientos))\n\n**Merged pull requests:**\n\n- Improve nftables::rule's documentation [\\#68](https://github.com/voxpupuli/puppet-nftables/pull/68) ([nbarrientos](https://github.com/nbarrientos))\n\n## [v1.1.0](https://github.com/voxpupuli/puppet-nftables/tree/v1.1.0) (2021-01-25)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/v1.0.0...v1.1.0)\n\n**Implemented enhancements:**\n\n- Enable parameter\\_documentation lint [\\#64](https://github.com/voxpupuli/puppet-nftables/pull/64) ([traylenator](https://github.com/traylenator))\n- Add Samba in rules [\\#62](https://github.com/voxpupuli/puppet-nftables/pull/62) ([glpatcern](https://github.com/glpatcern))\n- Add some mail related outgoing rules [\\#60](https://github.com/voxpupuli/puppet-nftables/pull/60) ([duritong](https://github.com/duritong))\n\n**Fixed bugs:**\n\n- nftables::simplerule should follow the same rules as nftables::rule [\\#58](https://github.com/voxpupuli/puppet-nftables/issues/58)\n- Align simplerule and rule rulename requirements [\\#59](https://github.com/voxpupuli/puppet-nftables/pull/59) ([nbarrientos](https://github.com/nbarrientos))\n\n**Closed issues:**\n\n- Get it under the voxpupuli umbrella [\\#35](https://github.com/voxpupuli/puppet-nftables/issues/35)\n\n**Merged pull requests:**\n\n- Add badges to README [\\#63](https://github.com/voxpupuli/puppet-nftables/pull/63) ([traylenator](https://github.com/traylenator))\n- Check that all the predefined rules are declared in the all rules acceptance test [\\#53](https://github.com/voxpupuli/puppet-nftables/pull/53) ([nbarrientos](https://github.com/nbarrientos))\n\n## 
[v1.0.0](https://github.com/voxpupuli/puppet-nftables/tree/v1.0.0) (2020-12-15)\n\n[Full Changelog](https://github.com/voxpupuli/puppet-nftables/compare/0ba57c66a35ed4e9b570d8a6315a33a1c4ba3181...v1.0.0)\n\n**Breaking changes:**\n\n- switch the server naming [\\#42](https://github.com/voxpupuli/puppet-nftables/pull/42) ([duritong](https://github.com/duritong))\n\n**Implemented enhancements:**\n\n- Use Stdlib::Port everywhere in place of Integer [\\#56](https://github.com/voxpupuli/puppet-nftables/pull/56) ([traylenator](https://github.com/traylenator))\n- Enable Puppet 7 support [\\#51](https://github.com/voxpupuli/puppet-nftables/pull/51) ([bastelfreak](https://github.com/bastelfreak))\n- Several fixes for nftables::config [\\#48](https://github.com/voxpupuli/puppet-nftables/pull/48) ([nbarrientos](https://github.com/nbarrientos))\n- rubocop corrections [\\#41](https://github.com/voxpupuli/puppet-nftables/pull/41) ([traylenator](https://github.com/traylenator))\n- Add basic configuration validation acceptance test [\\#38](https://github.com/voxpupuli/puppet-nftables/pull/38) ([traylenator](https://github.com/traylenator))\n- Remove duplicate flush on reload [\\#34](https://github.com/voxpupuli/puppet-nftables/pull/34) ([traylenator](https://github.com/traylenator))\n- Add nftables::simplerule [\\#33](https://github.com/voxpupuli/puppet-nftables/pull/33) ([nbarrientos](https://github.com/nbarrientos))\n- Add Ceph and NFS rules [\\#32](https://github.com/voxpupuli/puppet-nftables/pull/32) ([dvanders](https://github.com/dvanders))\n- New parameter noflush\\_tables to selectivly skip flush [\\#31](https://github.com/voxpupuli/puppet-nftables/pull/31) ([traylenator](https://github.com/traylenator))\n- Scientific Linux 8 will never exist [\\#30](https://github.com/voxpupuli/puppet-nftables/pull/30) ([traylenator](https://github.com/traylenator))\n- Enable conntrack in FORWARD [\\#29](https://github.com/voxpupuli/puppet-nftables/pull/29) ([keachi](https://github.com/keachi))\n- Do not test nftables::rules repeatadly [\\#28](https://github.com/voxpupuli/puppet-nftables/pull/28) ([traylenator](https://github.com/traylenator))\n- Allow sourcing sets from Hiera [\\#26](https://github.com/voxpupuli/puppet-nftables/pull/26) ([nbarrientos](https://github.com/nbarrientos))\n- Allow disabling default NAT tables and chains [\\#25](https://github.com/voxpupuli/puppet-nftables/pull/25) ([nbarrientos](https://github.com/nbarrientos))\n- Set a customisable rate limit to the logging rules [\\#22](https://github.com/voxpupuli/puppet-nftables/pull/22) ([nbarrientos](https://github.com/nbarrientos))\n- Make masking Service\\['firewalld'\\] optional [\\#20](https://github.com/voxpupuli/puppet-nftables/pull/20) ([nbarrientos](https://github.com/nbarrientos))\n- Move ICMP stuff to separate classes allowing better customisation [\\#16](https://github.com/voxpupuli/puppet-nftables/pull/16) ([nbarrientos](https://github.com/nbarrientos))\n- Move conntrack rules from global to INPUT and OUTPUT [\\#14](https://github.com/voxpupuli/puppet-nftables/pull/14) ([nbarrientos](https://github.com/nbarrientos))\n- Add comments for all the nftable::rules entries [\\#13](https://github.com/voxpupuli/puppet-nftables/pull/13) ([traylenator](https://github.com/traylenator))\n- Allow tables to add comments to $log\\_prefix [\\#12](https://github.com/voxpupuli/puppet-nftables/pull/12) ([nbarrientos](https://github.com/nbarrientos))\n- Reload rules atomically and verify rules before deploy 
[\\#10](https://github.com/voxpupuli/puppet-nftables/pull/10) ([traylenator](https://github.com/traylenator))\n- Allow raw sets and dashes in set names [\\#8](https://github.com/voxpupuli/puppet-nftables/pull/8) ([nbarrientos](https://github.com/nbarrientos))\n- Add a parameter to control the fate of discarded traffic [\\#7](https://github.com/voxpupuli/puppet-nftables/pull/7) ([nbarrientos](https://github.com/nbarrientos))\n- Add rules for afs3\\_callback in and out rules for kerberos and openafs. [\\#6](https://github.com/voxpupuli/puppet-nftables/pull/6) ([traylenator](https://github.com/traylenator))\n- Allow customising the log prefix [\\#5](https://github.com/voxpupuli/puppet-nftables/pull/5) ([nbarrientos](https://github.com/nbarrientos))\n- Add classes encapsulating rules for DHCPv6 client traffic \\(in/out\\) [\\#4](https://github.com/voxpupuli/puppet-nftables/pull/4) ([nbarrientos](https://github.com/nbarrientos))\n- Add support for named sets [\\#3](https://github.com/voxpupuli/puppet-nftables/pull/3) ([nbarrientos](https://github.com/nbarrientos))\n- New parameter out\\_all, default false [\\#1](https://github.com/voxpupuli/puppet-nftables/pull/1) ([traylenator](https://github.com/traylenator))\n\n**Fixed bugs:**\n\n- Correct nfs3 invalid udp /tcp matching rule and more tests [\\#50](https://github.com/voxpupuli/puppet-nftables/pull/50) ([traylenator](https://github.com/traylenator))\n- Prefix custom tables with custom- so they're loaded [\\#47](https://github.com/voxpupuli/puppet-nftables/pull/47) ([nbarrientos](https://github.com/nbarrientos))\n- Correct bad merge [\\#15](https://github.com/voxpupuli/puppet-nftables/pull/15) ([traylenator](https://github.com/traylenator))\n\n**Closed issues:**\n\n- deploying custom tables is broken [\\#45](https://github.com/voxpupuli/puppet-nftables/issues/45)\n- Switch to Stdlib::Port everywhere [\\#37](https://github.com/voxpupuli/puppet-nftables/issues/37)\n- Add set definition from Hiera [\\#24](https://github.com/voxpupuli/puppet-nftables/issues/24)\n- Add an option to disable NAT [\\#23](https://github.com/voxpupuli/puppet-nftables/issues/23)\n- Add an option to limit the rate of logged messages [\\#19](https://github.com/voxpupuli/puppet-nftables/issues/19)\n- Rule API [\\#17](https://github.com/voxpupuli/puppet-nftables/issues/17)\n- Publish to forge.puppet.com [\\#11](https://github.com/voxpupuli/puppet-nftables/issues/11)\n- The global chain contains INPUT specific rules [\\#9](https://github.com/voxpupuli/puppet-nftables/issues/9)\n- The fate of forbidden packets should be configurable [\\#2](https://github.com/voxpupuli/puppet-nftables/issues/2)\n\n**Merged pull requests:**\n\n- Docs for nftables::set [\\#55](https://github.com/voxpupuli/puppet-nftables/pull/55) ([traylenator](https://github.com/traylenator))\n- Remove a blank separating the doc string and the code [\\#52](https://github.com/voxpupuli/puppet-nftables/pull/52) ([nbarrientos](https://github.com/nbarrientos))\n- Release 1.0.0 [\\#49](https://github.com/voxpupuli/puppet-nftables/pull/49) ([traylenator](https://github.com/traylenator))\n- Correct layout of ignore table example [\\#44](https://github.com/voxpupuli/puppet-nftables/pull/44) ([traylenator](https://github.com/traylenator))\n- Fix typos and formatting in the README [\\#43](https://github.com/voxpupuli/puppet-nftables/pull/43) ([nbarrientos](https://github.com/nbarrientos))\n- Comment why firewalld\\_enable parameter is required [\\#40](https://github.com/voxpupuli/puppet-nftables/pull/40) 
([traylenator](https://github.com/traylenator))\n- modulesync 4.0.0 [\\#36](https://github.com/voxpupuli/puppet-nftables/pull/36) ([traylenator](https://github.com/traylenator))\n- Refresh REFERENCE [\\#27](https://github.com/voxpupuli/puppet-nftables/pull/27) ([traylenator](https://github.com/traylenator))\n\n\n\n\\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*\n", + "license": " Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\n TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n 1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. 
For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n 2. Grant of Copyright License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n 3. Grant of Patent License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n 4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. 
The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n 5. Submission of Contributions. Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n 6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n 7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n 8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n 9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. 
However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\n END OF TERMS AND CONDITIONS\n\n APPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"{}\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\n Copyright 2020 immerda / Puppet Modules\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n", + "reference": "# Reference\n\n\n\n## Table of Contents\n\n### Classes\n\n* [`nftables`](#nftables): Configure nftables\n* [`nftables::bridges`](#nftablesbridges): allow forwarding traffic on bridges\n* [`nftables::inet_filter`](#nftablesinet_filter): manage basic chains in table inet filter\n* [`nftables::inet_filter::fwd_conntrack`](#nftablesinet_filterfwd_conntrack): enable conntrack for fwd\n* [`nftables::inet_filter::in_out_conntrack`](#nftablesinet_filterin_out_conntrack): manage input & output conntrack\n* [`nftables::ip_nat`](#nftablesip_nat): manage basic chains in table ip nat\n* [`nftables::rules::activemq`](#nftablesrulesactivemq): Provides input rules for Apache ActiveMQ\n* [`nftables::rules::afs3_callback`](#nftablesrulesafs3_callback): Open call back port for AFS clients\n* [`nftables::rules::ceph`](#nftablesrulesceph): Ceph is a distributed object store and file system. 
Enable this to support Ceph's Object Storage Daemons (OSD), Metadata Server Daemons (MDS)\n* [`nftables::rules::ceph_mon`](#nftablesrulesceph_mon): Ceph is a distributed object store and file system.\nEnable this option to support Ceph's Monitor Daemon.\n* [`nftables::rules::dhcpv6_client`](#nftablesrulesdhcpv6_client): allow DHCPv6 requests in to a host\n* [`nftables::rules::dns`](#nftablesrulesdns): manage in dns\n* [`nftables::rules::docker_ce`](#nftablesrulesdocker_ce): Default firewall configuration for Docker-CE\n* [`nftables::rules::http`](#nftablesruleshttp): manage in http\n* [`nftables::rules::https`](#nftablesruleshttps): manage in https\n* [`nftables::rules::icinga2`](#nftablesrulesicinga2): manage in icinga2\n* [`nftables::rules::icmp`](#nftablesrulesicmp)\n* [`nftables::rules::nfs`](#nftablesrulesnfs): manage in nfs4\n* [`nftables::rules::nfs3`](#nftablesrulesnfs3): manage in nfs3\n* [`nftables::rules::node_exporter`](#nftablesrulesnode_exporter): manage in node exporter\n* [`nftables::rules::ospf`](#nftablesrulesospf): manage in ospf\n* [`nftables::rules::ospf3`](#nftablesrulesospf3): manage in ospf3\n* [`nftables::rules::out::all`](#nftablesrulesoutall): allow all outbound\n* [`nftables::rules::out::ceph_client`](#nftablesrulesoutceph_client): Ceph is a distributed object store and file system.\nEnable this to be a client of Ceph's Monitor (MON),\nObject Storage Daemons (OSD), Metadata Server Daemons (MDS),\nand Manager Daemons (MGR).\n* [`nftables::rules::out::chrony`](#nftablesrulesoutchrony): manage out chrony\n* [`nftables::rules::out::dhcp`](#nftablesrulesoutdhcp): manage out dhcp\n* [`nftables::rules::out::dhcpv6_client`](#nftablesrulesoutdhcpv6_client): Allow DHCPv6 requests out of a host\n* [`nftables::rules::out::dns`](#nftablesrulesoutdns): manage out dns\n* [`nftables::rules::out::hkp`](#nftablesrulesouthkp): allow outgoing hkp connections to gpg keyservers\n* [`nftables::rules::out::http`](#nftablesrulesouthttp): manage out http\n* [`nftables::rules::out::https`](#nftablesrulesouthttps): manage out https\n* [`nftables::rules::out::icmp`](#nftablesrulesouticmp): control outbound icmp packages\n* [`nftables::rules::out::imap`](#nftablesrulesoutimap): allow outgoing imap\n* [`nftables::rules::out::kerberos`](#nftablesrulesoutkerberos): allows outbound access for kerberos\n* [`nftables::rules::out::mysql`](#nftablesrulesoutmysql): manage out mysql\n* [`nftables::rules::out::nfs`](#nftablesrulesoutnfs): manage out nfs\n* [`nftables::rules::out::nfs3`](#nftablesrulesoutnfs3): manage out nfs3\n* [`nftables::rules::out::openafs_client`](#nftablesrulesoutopenafs_client): allows outbound access for afs clients\n7000 - afs3-fileserver\n7002 - afs3-ptserver\n7003 - vlserver\n* [`nftables::rules::out::ospf`](#nftablesrulesoutospf): manage out ospf\n* [`nftables::rules::out::ospf3`](#nftablesrulesoutospf3): manage out ospf3\n* [`nftables::rules::out::pop3`](#nftablesrulesoutpop3): allow outgoing pop3\n* [`nftables::rules::out::postgres`](#nftablesrulesoutpostgres): manage out postgres\n* [`nftables::rules::out::puppet`](#nftablesrulesoutpuppet): manage outgoing puppet\n* [`nftables::rules::out::pxp_agent`](#nftablesrulesoutpxp_agent): manage outgoing pxp-agent\n* [`nftables::rules::out::smtp`](#nftablesrulesoutsmtp): allow outgoing smtp\n* [`nftables::rules::out::smtp_client`](#nftablesrulesoutsmtp_client): allow outgoing smtp client\n* [`nftables::rules::out::ssh`](#nftablesrulesoutssh): manage out ssh\n* [`nftables::rules::out::ssh::remove`](#nftablesrulesoutsshremove): 
disable outgoing ssh\n* [`nftables::rules::out::tor`](#nftablesrulesouttor): manage out tor\n* [`nftables::rules::out::whois`](#nftablesrulesoutwhois): allow clients to query remote whois server\n* [`nftables::rules::out::wireguard`](#nftablesrulesoutwireguard): manage out wireguard\n* [`nftables::rules::puppet`](#nftablesrulespuppet): manage in puppet\n* [`nftables::rules::pxp_agent`](#nftablesrulespxp_agent): manage in pxp-agent\n* [`nftables::rules::qemu`](#nftablesrulesqemu): Bridged network configuration for qemu/libvirt\n* [`nftables::rules::samba`](#nftablesrulessamba): manage Samba, the suite to allow Windows file sharing on Linux resources.\n* [`nftables::rules::smtp`](#nftablesrulessmtp): manage in smtp\n* [`nftables::rules::smtp_submission`](#nftablesrulessmtp_submission): manage in smtp submission\n* [`nftables::rules::smtps`](#nftablesrulessmtps): manage in smtps\n* [`nftables::rules::ssh`](#nftablesrulesssh): manage in ssh\n* [`nftables::rules::tor`](#nftablesrulestor): manage in tor\n* [`nftables::rules::wireguard`](#nftablesruleswireguard): manage in wireguard\n* [`nftables::services::dhcpv6_client`](#nftablesservicesdhcpv6_client): Allow in and outbound traffic for DHCPv6 server\n* [`nftables::services::openafs_client`](#nftablesservicesopenafs_client): Open inbound and outbound ports for an AFS client\n\n### Defined types\n\n* [`nftables::chain`](#nftableschain): manage a chain\n* [`nftables::config`](#nftablesconfig): manage a config snippet\n* [`nftables::file`](#nftablesfile): Insert a file into the nftables configuration\n* [`nftables::rule`](#nftablesrule): Provides an interface to create a firewall rule\n* [`nftables::rules::dnat4`](#nftablesrulesdnat4): manage a ipv4 dnat rule\n* [`nftables::rules::masquerade`](#nftablesrulesmasquerade): masquerade all outgoing traffic\n* [`nftables::rules::snat4`](#nftablesrulessnat4): manage a ipv4 snat rule\n* [`nftables::set`](#nftablesset): manage a named set\n* [`nftables::simplerule`](#nftablessimplerule): Provides a simplified interface to nftables::rule\n\n### Data types\n\n* [`Nftables::Addr`](#nftablesaddr): Represents an address expression to be used within a rule.\n* [`Nftables::Addr::Set`](#nftablesaddrset): Represents a set expression to be used within a rule.\n* [`Nftables::Port`](#nftablesport): Represents a port expression to be used within a rule.\n* [`Nftables::Port::Range`](#nftablesportrange): Represents a port range expression to be used within a rule.\n* [`Nftables::RuleName`](#nftablesrulename): Represents a rule name to be used in a raw rule created via nftables::rule.\nIt's a dash separated string. 
The first component describes the chain to\nadd the rule to, the second the rule name and the (optional) third a number.\nEx: 'default_in-sshd', 'default_out-my_service-2'.\n* [`Nftables::SimpleRuleName`](#nftablessimplerulename): Represents a simple rule name to be used in a rule created via nftables::simplerule\n\n## Classes\n\n### `nftables`\n\nConfigure nftables\n\n#### Examples\n\n##### allow dns out and do not allow ntp out\n\n```puppet\nclass{ 'nftables':\n out_ntp => false,\n out_dns => true,\n}\n```\n\n##### do not flush particular tables, fail2ban in this case\n\n```puppet\nclass{ 'nftables':\n noflush_tables => ['inet-f2b-table'],\n}\n```\n\n#### Parameters\n\nThe following parameters are available in the `nftables` class:\n\n* [`out_all`](#out_all)\n* [`out_ntp`](#out_ntp)\n* [`out_http`](#out_http)\n* [`out_dns`](#out_dns)\n* [`out_https`](#out_https)\n* [`out_icmp`](#out_icmp)\n* [`in_ssh`](#in_ssh)\n* [`in_icmp`](#in_icmp)\n* [`inet_filter`](#inet_filter)\n* [`nat`](#nat)\n* [`nat_table_name`](#nat_table_name)\n* [`sets`](#sets)\n* [`log_prefix`](#log_prefix)\n* [`log_limit`](#log_limit)\n* [`reject_with`](#reject_with)\n* [`in_out_conntrack`](#in_out_conntrack)\n* [`fwd_conntrack`](#fwd_conntrack)\n* [`firewalld_enable`](#firewalld_enable)\n* [`noflush_tables`](#noflush_tables)\n* [`rules`](#rules)\n* [`configuration_path`](#configuration_path)\n* [`nft_path`](#nft_path)\n* [`echo`](#echo)\n\n##### `out_all`\n\nData type: `Boolean`\n\nAllow all outbound connections. If `true` then all other\nout parameters `out_ntp`, `out_dns`, ... will be assuemed\nfalse.\n\nDefault value: ``false``\n\n##### `out_ntp`\n\nData type: `Boolean`\n\nAllow outbound to ntp servers.\n\nDefault value: ``true``\n\n##### `out_http`\n\nData type: `Boolean`\n\nAllow outbound to http servers.\n\nDefault value: ``true``\n\n##### `out_dns`\n\nData type: `Boolean`\n\nAllow outbound to dns servers.\n\nDefault value: ``true``\n\n##### `out_https`\n\nData type: `Boolean`\n\nAllow outbound to https servers.\n\nDefault value: ``true``\n\n##### `out_icmp`\n\nData type: `Boolean`\n\nAllow outbound ICMPv4/v6 traffic.\n\nDefault value: ``true``\n\n##### `in_ssh`\n\nData type: `Boolean`\n\nAllow inbound to ssh servers.\n\nDefault value: ``true``\n\n##### `in_icmp`\n\nData type: `Boolean`\n\nAllow inbound ICMPv4/v6 traffic.\n\nDefault value: ``true``\n\n##### `inet_filter`\n\nData type: `Boolean`\n\nAdd default tables, chains and rules to process traffic.\n\nDefault value: ``true``\n\n##### `nat`\n\nData type: `Boolean`\n\nAdd default tables and chains to process NAT traffic.\n\nDefault value: ``true``\n\n##### `nat_table_name`\n\nData type: `String[1]`\n\nThe name of the 'nat' table.\n\nDefault value: `'nat'`\n\n##### `sets`\n\nData type: `Hash`\n\nAllows sourcing set definitions directly from Hiera.\n\nDefault value: `{}`\n\n##### `log_prefix`\n\nData type: `String`\n\nString that will be used as prefix when logging packets. It can contain\ntwo variables using standard sprintf() string-formatting:\n * chain: Will be replaced by the name of the chain.\n * comment: Allows chains to add extra comments.\n\nDefault value: `'[nftables] %s %s'`\n\n##### `log_limit`\n\nData type: `Variant[Boolean[false], String]`\n\nString with the content of a limit statement to be applied\nto the rules that log discarded traffic. Set to false to\ndisable rate limiting.\n\nDefault value: `'3/minute burst 5 packets'`\n\n##### `reject_with`\n\nData type: `Variant[Boolean[false], Pattern[/icmp(v6|x)? 
type .+|tcp reset/]]`\n\nHow to discard packets not matching any rule. If `false`, the\nfate of the packet will be defined by the chain policy (normally\ndrop), otherwise the packet will be rejected with the REJECT_WITH\npolicy indicated by the value of this parameter.\n\nDefault value: `'icmpx type port-unreachable'`\n\n##### `in_out_conntrack`\n\nData type: `Boolean`\n\nAdds INPUT and OUTPUT rules to allow traffic that's part of an\nestablished connection and also to drop invalid packets.\n\nDefault value: ``true``\n\n##### `fwd_conntrack`\n\nData type: `Boolean`\n\nAdds FORWARD rules to allow traffic that's part of an\nestablished connection and also to drop invalid packets.\n\nDefault value: ``false``\n\n##### `firewalld_enable`\n\nData type: `Variant[Boolean[false], Enum['mask']]`\n\nConfigures how the firewalld systemd service unit is enabled. It might be\nuseful to set this to false if you're externaly removing firewalld from\nthe system completely.\n\nDefault value: `'mask'`\n\n##### `noflush_tables`\n\nData type: `Optional[Array[Pattern[/^(ip|ip6|inet|arp|bridge|netdev)-[-a-zA-Z0-9_]+$/],1]]`\n\nIf specified only other existings tables will be flushed.\nIf left unset all tables will be flushed via a `flush ruleset`\n\nDefault value: ``undef``\n\n##### `rules`\n\nData type: `Hash`\n\nSpecify hashes of `nftables::rule`s via hiera\n\nDefault value: `{}`\n\n##### `configuration_path`\n\nData type: `Stdlib::Unixpath`\n\nThe absolute path to the principal nftables configuration file. The default\nvaries depending on the system, and is set in the module's data.\n\n##### `nft_path`\n\nData type: `Stdlib::Unixpath`\n\nPath to the nft binary\n\n##### `echo`\n\nData type: `Stdlib::Unixpath`\n\nPath to the echo binary\n\n### `nftables::bridges`\n\nallow forwarding traffic on bridges\n\n#### Parameters\n\nThe following parameters are available in the `nftables::bridges` class:\n\n* [`ensure`](#ensure)\n* [`bridgenames`](#bridgenames)\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\n\n\nDefault value: `'present'`\n\n##### `bridgenames`\n\nData type: `Regexp`\n\n\n\nDefault value: `/^br.+/`\n\n### `nftables::inet_filter`\n\nmanage basic chains in table inet filter\n\n### `nftables::inet_filter::fwd_conntrack`\n\nenable conntrack for fwd\n\n### `nftables::inet_filter::in_out_conntrack`\n\nmanage input & output conntrack\n\n### `nftables::ip_nat`\n\nmanage basic chains in table ip nat\n\n### `nftables::rules::activemq`\n\nProvides input rules for Apache ActiveMQ\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::activemq` class:\n\n* [`tcp`](#tcp)\n* [`udp`](#udp)\n* [`port`](#port)\n\n##### `tcp`\n\nData type: `Boolean`\n\nCreate the rule for TCP traffic.\n\nDefault value: ``true``\n\n##### `udp`\n\nData type: `Boolean`\n\nCreate the rule for UDP traffic.\n\nDefault value: ``true``\n\n##### `port`\n\nData type: `Stdlib::Port`\n\nThe port number for the ActiveMQ daemon.\n\nDefault value: `61616`\n\n### `nftables::rules::afs3_callback`\n\nOpen call back port for AFS clients\n\n#### Examples\n\n##### allow call backs from particular hosts\n\n```puppet\nclass{'nftables::rules::afs3_callback':\n saddr => ['192.168.0.0/16', '10.0.0.222']\n}\n```\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::afs3_callback` class:\n\n* [`saddr`](#saddr)\n\n##### `saddr`\n\nData type: `Array[Stdlib::IP::Address::V4,1]`\n\nlist of source network ranges to a\n\nDefault value: `['0.0.0.0/0']`\n\n### `nftables::rules::ceph`\n\nCeph is a 
distributed object store and file system.\nEnable this to support Ceph's Object Storage Daemons (OSD),\nMetadata Server Daemons (MDS), or Manager Daemons (MGR).\n\n### `nftables::rules::ceph_mon`\n\nCeph is a distributed object store and file system.\nEnable this option to support Ceph's Monitor Daemon.\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::ceph_mon` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nspecify ports for ceph service\n\nDefault value: `[3300, 6789]`\n\n### `nftables::rules::dhcpv6_client`\n\nallow DHCPv6 requests in to a host\n\n### `nftables::rules::dns`\n\nmanage in dns\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::dns` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nSpecify ports for dns.\n\nDefault value: `[53]`\n\n### `nftables::rules::docker_ce`\n\nThe configuration distributed in this class represents the default firewall\nconfiguration done by docker-ce when the iptables integration is enabled.\n\nThis class is needed as the default docker-ce rules added to ip-filter conflict\nwith the inet-filter forward rules set by default in this module.\n\nWhen using this class 'docker::iptables: false' should be set.\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::docker_ce` class:\n\n* [`docker_interface`](#docker_interface)\n* [`docker_prefix`](#docker_prefix)\n* [`manage_docker_chains`](#manage_docker_chains)\n* [`manage_base_chains`](#manage_base_chains)\n\n##### `docker_interface`\n\nData type: `String[1]`\n\nInterface name used by docker.\n\nDefault value: `'docker0'`\n\n##### `docker_prefix`\n\nData type: `Stdlib::IP::Address::V4::CIDR`\n\nThe address space used by docker.\n\nDefault value: `'172.17.0.0/16'`\n\n##### `manage_docker_chains`\n\nData type: `Boolean`\n\nFlag to control whether the class should create the docker related chains.\n\nDefault value: ``true``\n\n##### `manage_base_chains`\n\nData type: `Boolean`\n\nFlag to control whether the class should create the base common chains.\n\nDefault value: ``true``\n\n### `nftables::rules::http`\n\nmanage in http\n\n### `nftables::rules::https`\n\nmanage in https\n\n### `nftables::rules::icinga2`\n\nmanage in icinga2\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::icinga2` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nSpecify ports for icinga1\n\nDefault value: `[5665]`\n\n### `nftables::rules::icmp`\n\nThe nftables::rules::icmp class.\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::icmp` class:\n\n* [`v4_types`](#v4_types)\n* [`v6_types`](#v6_types)\n* [`order`](#order)\n\n##### `v4_types`\n\nData type: `Optional[Array[String]]`\n\n\n\nDefault value: ``undef``\n\n##### `v6_types`\n\nData type: `Optional[Array[String]]`\n\n\n\nDefault value: ``undef``\n\n##### `order`\n\nData type: `String`\n\n\n\nDefault value: `'10'`\n\n### `nftables::rules::nfs`\n\nmanage in nfs4\n\n### `nftables::rules::nfs3`\n\nmanage in nfs3\n\n### `nftables::rules::node_exporter`\n\nmanage in node exporter\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::node_exporter` class:\n\n* [`prometheus_server`](#prometheus_server)\n* [`port`](#port)\n\n##### `prometheus_server`\n\nData type: `Optional[Variant[String,Array[String,1]]]`\n\nSpecify server name\n\nDefault value: ``undef``\n\n##### 
`port`\n\nData type: `Stdlib::Port`\n\nSpecify port to open\n\nDefault value: `9100`\n\n### `nftables::rules::ospf`\n\nmanage in ospf\n\n### `nftables::rules::ospf3`\n\nmanage in ospf3\n\n### `nftables::rules::out::all`\n\nallow all outbound\n\n### `nftables::rules::out::ceph_client`\n\nCeph is a distributed object store and file system.\nEnable this to be a client of Ceph's Monitor (MON),\nObject Storage Daemons (OSD), Metadata Server Daemons (MDS),\nand Manager Daemons (MGR).\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::ceph_client` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nSpecify ports to open\n\nDefault value: `[3300, 6789]`\n\n### `nftables::rules::out::chrony`\n\nmanage out chrony\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::chrony` class:\n\n* [`servers`](#servers)\n\n##### `servers`\n\nData type: `Array[Stdlib::IP::Address]`\n\nsingle IP-Address or array of IP-addresses from NTP servers\n\nDefault value: `[]`\n\n### `nftables::rules::out::dhcp`\n\nmanage out dhcp\n\n### `nftables::rules::out::dhcpv6_client`\n\nAllow DHCPv6 requests out of a host\n\n### `nftables::rules::out::dns`\n\nmanage out dns\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::dns` class:\n\n* [`dns_server`](#dns_server)\n\n##### `dns_server`\n\nData type: `Optional[Variant[String,Array[String,1]]]`\n\nspecify dns_server name\n\nDefault value: ``undef``\n\n### `nftables::rules::out::hkp`\n\nallow outgoing hkp connections to gpg keyservers\n\n### `nftables::rules::out::http`\n\nmanage out http\n\n### `nftables::rules::out::https`\n\nmanage out https\n\n### `nftables::rules::out::icmp`\n\ncontrol outbound icmp packages\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::icmp` class:\n\n* [`v4_types`](#v4_types)\n* [`v6_types`](#v6_types)\n* [`order`](#order)\n\n##### `v4_types`\n\nData type: `Optional[Array[String]]`\n\n\n\nDefault value: ``undef``\n\n##### `v6_types`\n\nData type: `Optional[Array[String]]`\n\n\n\nDefault value: ``undef``\n\n##### `order`\n\nData type: `String`\n\n\n\nDefault value: `'10'`\n\n### `nftables::rules::out::imap`\n\nallow outgoing imap\n\n### `nftables::rules::out::kerberos`\n\nallows outbound access for kerberos\n\n### `nftables::rules::out::mysql`\n\nmanage out mysql\n\n### `nftables::rules::out::nfs`\n\nmanage out nfs\n\n### `nftables::rules::out::nfs3`\n\nmanage out nfs3\n\n### `nftables::rules::out::openafs_client`\n\nallows outbound access for afs clients\n7000 - afs3-fileserver\n7002 - afs3-ptserver\n7003 - vlserver\n\n* **See also**\n * https://wiki.openafs.org/devel/AFSServicePorts/\n * AFS Service Ports\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::openafs_client` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nport numbers to use\n\nDefault value: `[7000, 7002, 7003]`\n\n### `nftables::rules::out::ospf`\n\nmanage out ospf\n\n### `nftables::rules::out::ospf3`\n\nmanage out ospf3\n\n### `nftables::rules::out::pop3`\n\nallow outgoing pop3\n\n### `nftables::rules::out::postgres`\n\nmanage out postgres\n\n### `nftables::rules::out::puppet`\n\nmanage outgoing puppet\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::puppet` class:\n\n* [`puppetserver`](#puppetserver)\n* [`puppetserver_port`](#puppetserver_port)\n\n##### `puppetserver`\n\nData 
type: `Variant[Stdlib::IP::Address,Array[Stdlib::IP::Address,1]]`\n\npuppetserver hostname\n\n##### `puppetserver_port`\n\nData type: `Stdlib::Port`\n\npuppetserver port\n\nDefault value: `8140`\n\n### `nftables::rules::out::pxp_agent`\n\nmanage outgoing pxp-agent\n\n* **See also**\n * also\n * take a look at nftables::rules::out::puppet, because the PXP agent also connects to a Puppetserver\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::pxp_agent` class:\n\n* [`broker`](#broker)\n* [`broker_port`](#broker_port)\n\n##### `broker`\n\nData type: `Variant[Stdlib::IP::Address,Array[Stdlib::IP::Address,1]]`\n\nPXP broker IP(s)\n\n##### `broker_port`\n\nData type: `Stdlib::Port`\n\nPXP broker port\n\nDefault value: `8142`\n\n### `nftables::rules::out::smtp`\n\nallow outgoing smtp\n\n### `nftables::rules::out::smtp_client`\n\nallow outgoing smtp client\n\n### `nftables::rules::out::ssh`\n\nmanage out ssh\n\n### `nftables::rules::out::ssh::remove`\n\ndisable outgoing ssh\n\n### `nftables::rules::out::tor`\n\nmanage out tor\n\n### `nftables::rules::out::whois`\n\nallow clients to query remote whois server\n\n### `nftables::rules::out::wireguard`\n\nmanage out wireguard\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::out::wireguard` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Integer,1]`\n\nspecify wireguard ports\n\nDefault value: `[51820]`\n\n### `nftables::rules::puppet`\n\nmanage in puppet\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::puppet` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Integer,1]`\n\npuppet server ports\n\nDefault value: `[8140]`\n\n### `nftables::rules::pxp_agent`\n\nmanage in pxp-agent\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::pxp_agent` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\npxp server ports\n\nDefault value: `[8142]`\n\n### `nftables::rules::qemu`\n\nThis class configures the typical firewall setup that libvirt\ncreates. 
Depending on your requirements you can switch on and off\nseveral aspects, for instance if you don't do DHCP to your guests\nyou can disable the rules that accept DHCP traffic on the host or if\nyou don't want your guests to talk to hosts outside you can disable\nforwarding and/or masquerading for IPv4 traffic.\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::qemu` class:\n\n* [`interface`](#interface)\n* [`network_v4`](#network_v4)\n* [`network_v6`](#network_v6)\n* [`dns`](#dns)\n* [`dhcpv4`](#dhcpv4)\n* [`forward_traffic`](#forward_traffic)\n* [`internal_traffic`](#internal_traffic)\n* [`masquerade`](#masquerade)\n\n##### `interface`\n\nData type: `String[1]`\n\nInterface name used by the bridge.\n\nDefault value: `'virbr0'`\n\n##### `network_v4`\n\nData type: `Stdlib::IP::Address::V4::CIDR`\n\nThe IPv4 network prefix used in the virtual network.\n\nDefault value: `'192.168.122.0/24'`\n\n##### `network_v6`\n\nData type: `Optional[Stdlib::IP::Address::V6::CIDR]`\n\nThe IPv6 network prefix used in the virtual network.\n\nDefault value: ``undef``\n\n##### `dns`\n\nData type: `Boolean`\n\nAllow DNS traffic from the guests to the host.\n\nDefault value: ``true``\n\n##### `dhcpv4`\n\nData type: `Boolean`\n\nAllow DHCPv4 traffic from the guests to the host.\n\nDefault value: ``true``\n\n##### `forward_traffic`\n\nData type: `Boolean`\n\nAllow forwarded traffic (out all, in related/established)\ngenerated by the virtual network.\n\nDefault value: ``true``\n\n##### `internal_traffic`\n\nData type: `Boolean`\n\nAllow guests in the virtual network to talk to each other.\n\nDefault value: ``true``\n\n##### `masquerade`\n\nData type: `Boolean`\n\nDo NAT masquerade on all IPv4 traffic generated by guests\nto external networks.\n\nDefault value: ``true``\n\n### `nftables::rules::samba`\n\nmanage Samba, the suite to allow Windows file sharing on Linux resources.\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::samba` class:\n\n* [`ctdb`](#ctdb)\n\n##### `ctdb`\n\nData type: `Boolean`\n\nEnable ctdb-driven clustered Samba setups.\n\nDefault value: ``false``\n\n### `nftables::rules::smtp`\n\nmanage in smtp\n\n### `nftables::rules::smtp_submission`\n\nmanage in smtp submission\n\n### `nftables::rules::smtps`\n\nmanage in smtps\n\n### `nftables::rules::ssh`\n\nmanage in ssh\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::ssh` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nssh ports\n\nDefault value: `[22]`\n\n### `nftables::rules::tor`\n\nmanage in tor\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::tor` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nports for tor\n\nDefault value: `[9001]`\n\n### `nftables::rules::wireguard`\n\nmanage in wireguard\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::wireguard` class:\n\n* [`ports`](#ports)\n\n##### `ports`\n\nData type: `Array[Stdlib::Port,1]`\n\nwiregueard port\n\nDefault value: `[51820]`\n\n### `nftables::services::dhcpv6_client`\n\nAllow in and outbound traffic for DHCPv6 server\n\n### `nftables::services::openafs_client`\n\nOpen inbound and outbound ports for an AFS client\n\n## Defined types\n\n### `nftables::chain`\n\nmanage a chain\n\n#### Parameters\n\nThe following parameters are available in the `nftables::chain` defined type:\n\n* [`table`](#table)\n* [`chain`](#chain)\n* 
[`inject`](#inject)\n* [`inject_iif`](#inject_iif)\n* [`inject_oif`](#inject_oif)\n\n##### `table`\n\nData type: `Pattern[/^(ip|ip6|inet)-[a-zA-Z0-9_]+$/]`\n\n\n\nDefault value: `'inet-filter'`\n\n##### `chain`\n\nData type: `Pattern[/^[a-zA-Z0-9_]+$/]`\n\n\n\nDefault value: `$title`\n\n##### `inject`\n\nData type: `Optional[Pattern[/^\\d\\d-[a-zA-Z0-9_]+$/]]`\n\n\n\nDefault value: ``undef``\n\n##### `inject_iif`\n\nData type: `Optional[String]`\n\n\n\nDefault value: ``undef``\n\n##### `inject_oif`\n\nData type: `Optional[String]`\n\n\n\nDefault value: ``undef``\n\n### `nftables::config`\n\nmanage a config snippet\n\n#### Parameters\n\nThe following parameters are available in the `nftables::config` defined type:\n\n* [`tablespec`](#tablespec)\n* [`content`](#content)\n* [`source`](#source)\n* [`prefix`](#prefix)\n\n##### `tablespec`\n\nData type: `Pattern[/^\\w+-\\w+$/]`\n\n\n\nDefault value: `$title`\n\n##### `content`\n\nData type: `Optional[String]`\n\n\n\nDefault value: ``undef``\n\n##### `source`\n\nData type: `Optional[Variant[String,Array[String,1]]]`\n\n\n\nDefault value: ``undef``\n\n##### `prefix`\n\nData type: `String`\n\n\n\nDefault value: `'custom-'`\n\n### `nftables::file`\n\nInsert a file into the nftables configuration\n\n#### Examples\n\n##### Include a file that includes other files\n\n```puppet\nnftables::file{'geoip':\n content => @(EOT)\n include \"/var/local/geoipsets/dbip/nftset/ipv4/*.ipv4\"\n include \"/var/local/geoipsets/dbip/nftset/ipv6/*.ipv6\"\n |EOT,\n}\n```\n\n#### Parameters\n\nThe following parameters are available in the `nftables::file` defined type:\n\n* [`label`](#label)\n* [`content`](#content)\n* [`source`](#source)\n* [`prefix`](#prefix)\n\n##### `label`\n\nData type: `String[1]`\n\nUnique name to include in filename.\n\nDefault value: `$title`\n\n##### `content`\n\nData type: `Optional[String]`\n\nThe content to place in the file.\n\nDefault value: ``undef``\n\n##### `source`\n\nData type: `Optional[Variant[String,Array[String,1]]]`\n\nA source to obtain the file content from.\n\nDefault value: ``undef``\n\n##### `prefix`\n\nData type: `String`\n\nPrefix of file name to be created, if left as `file-` it will be\nauto included in the main nft configuration\n\nDefault value: `'file-'`\n\n### `nftables::rule`\n\nProvides an interface to create a firewall rule\n\n#### Examples\n\n##### add a rule named 'myhttp' to the 'default_in' chain to allow incoming traffic to TCP port 80\n\n```puppet\nnftables::rule {\n 'default_in-myhttp':\n content => 'tcp dport 80 accept',\n}\n```\n\n##### add a rule named 'count' to the 'PREROUTING6' chain in table 'ip6 nat' to count traffic\n\n```puppet\nnftables::rule {\n 'PREROUTING6-count':\n content => 'counter',\n table => 'ip6-nat'\n}\n```\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rule` defined type:\n\n* [`ensure`](#ensure)\n* [`rulename`](#rulename)\n* [`order`](#order)\n* [`table`](#table)\n* [`content`](#content)\n* [`source`](#source)\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\nShould the rule be created.\n\nDefault value: `'present'`\n\n##### `rulename`\n\nData type: `Nftables::RuleName`\n\nThe symbolic name for the rule and to what chain to add it. 
The\nformat is defined by the Nftables::RuleName type.\n\nDefault value: `$title`\n\n##### `order`\n\nData type: `Pattern[/^\\d\\d$/]`\n\nA number representing the order of the rule.\n\nDefault value: `'50'`\n\n##### `table`\n\nData type: `String`\n\nThe name of the table to add this rule to.\n\nDefault value: `'inet-filter'`\n\n##### `content`\n\nData type: `Optional[String]`\n\nThe raw statements that compose the rule represented using the nftables\nlanguage.\n\nDefault value: ``undef``\n\n##### `source`\n\nData type: `Optional[Variant[String,Array[String,1]]]`\n\nSame goal as content but sourcing the value from a file.\n\nDefault value: ``undef``\n\n### `nftables::rules::dnat4`\n\nmanage a ipv4 dnat rule\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::dnat4` defined type:\n\n* [`daddr`](#daddr)\n* [`port`](#port)\n* [`rulename`](#rulename)\n* [`order`](#order)\n* [`chain`](#chain)\n* [`iif`](#iif)\n* [`proto`](#proto)\n* [`dport`](#dport)\n* [`ensure`](#ensure)\n\n##### `daddr`\n\nData type: `Pattern[/^[12]?\\d{1,2}\\.[12]?\\d{1,2}\\.[12]?\\d{1,2}\\.[12]?\\d{1,2}$/]`\n\n\n\n##### `port`\n\nData type: `Variant[String,Stdlib::Port]`\n\n\n\n##### `rulename`\n\nData type: `Pattern[/^[a-zA-Z0-9_]+$/]`\n\n\n\nDefault value: `$title`\n\n##### `order`\n\nData type: `Pattern[/^\\d\\d$/]`\n\n\n\nDefault value: `'50'`\n\n##### `chain`\n\nData type: `String[1]`\n\n\n\nDefault value: `'default_fwd'`\n\n##### `iif`\n\nData type: `Optional[String[1]]`\n\n\n\nDefault value: ``undef``\n\n##### `proto`\n\nData type: `Enum['tcp','udp']`\n\n\n\nDefault value: `'tcp'`\n\n##### `dport`\n\nData type: `Optional[Variant[String,Stdlib::Port]]`\n\n\n\nDefault value: ``undef``\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\n\n\nDefault value: `'present'`\n\n### `nftables::rules::masquerade`\n\nmasquerade all outgoing traffic\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::masquerade` defined type:\n\n* [`rulename`](#rulename)\n* [`order`](#order)\n* [`chain`](#chain)\n* [`oif`](#oif)\n* [`saddr`](#saddr)\n* [`daddr`](#daddr)\n* [`proto`](#proto)\n* [`dport`](#dport)\n* [`ensure`](#ensure)\n\n##### `rulename`\n\nData type: `Pattern[/^[a-zA-Z0-9_]+$/]`\n\n\n\nDefault value: `$title`\n\n##### `order`\n\nData type: `Pattern[/^\\d\\d$/]`\n\n\n\nDefault value: `'70'`\n\n##### `chain`\n\nData type: `String[1]`\n\n\n\nDefault value: `'POSTROUTING'`\n\n##### `oif`\n\nData type: `Optional[String[1]]`\n\n\n\nDefault value: ``undef``\n\n##### `saddr`\n\nData type: `Optional[String[1]]`\n\n\n\nDefault value: ``undef``\n\n##### `daddr`\n\nData type: `Optional[String[1]]`\n\n\n\nDefault value: ``undef``\n\n##### `proto`\n\nData type: `Optional[Enum['tcp','udp']]`\n\n\n\nDefault value: ``undef``\n\n##### `dport`\n\nData type: `Optional[Variant[String,Stdlib::Port]]`\n\n\n\nDefault value: ``undef``\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\n\n\nDefault value: `'present'`\n\n### `nftables::rules::snat4`\n\nmanage a ipv4 snat rule\n\n#### Parameters\n\nThe following parameters are available in the `nftables::rules::snat4` defined type:\n\n* [`snat`](#snat)\n* [`rulename`](#rulename)\n* [`order`](#order)\n* [`chain`](#chain)\n* [`oif`](#oif)\n* [`saddr`](#saddr)\n* [`proto`](#proto)\n* [`dport`](#dport)\n* [`ensure`](#ensure)\n\n##### `snat`\n\nData type: `String[1]`\n\n\n\n##### `rulename`\n\nData type: `Pattern[/^[a-zA-Z0-9_]+$/]`\n\n\n\nDefault value: `$title`\n\n##### `order`\n\nData type: 
`Pattern[/^\\d\\d$/]`\n\n\n\nDefault value: `'70'`\n\n##### `chain`\n\nData type: `String[1]`\n\n\n\nDefault value: `'POSTROUTING'`\n\n##### `oif`\n\nData type: `Optional[String[1]]`\n\n\n\nDefault value: ``undef``\n\n##### `saddr`\n\nData type: `Optional[String[1]]`\n\n\n\nDefault value: ``undef``\n\n##### `proto`\n\nData type: `Optional[Enum['tcp','udp']]`\n\n\n\nDefault value: ``undef``\n\n##### `dport`\n\nData type: `Optional[Variant[String,Stdlib::Port]]`\n\n\n\nDefault value: ``undef``\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\n\n\nDefault value: `'present'`\n\n### `nftables::set`\n\nmanage a named set\n\n#### Examples\n\n##### simple set\n\n```puppet\nnftables::set{'my_set':\n type => 'ipv4_addr',\n flags => ['interval'],\n elements => ['192.168.0.1/24', '10.0.0.2'],\n auto_merge => true,\n}\n```\n\n#### Parameters\n\nThe following parameters are available in the `nftables::set` defined type:\n\n* [`ensure`](#ensure)\n* [`setname`](#setname)\n* [`order`](#order)\n* [`type`](#type)\n* [`table`](#table)\n* [`flags`](#flags)\n* [`timeout`](#timeout)\n* [`gc_interval`](#gc_interval)\n* [`elements`](#elements)\n* [`size`](#size)\n* [`policy`](#policy)\n* [`auto_merge`](#auto_merge)\n* [`content`](#content)\n* [`source`](#source)\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\nshould the set be created.\n\nDefault value: `'present'`\n\n##### `setname`\n\nData type: `Pattern[/^[-a-zA-Z0-9_]+$/]`\n\nname of set, equal to to title.\n\nDefault value: `$title`\n\n##### `order`\n\nData type: `Pattern[/^\\d\\d$/]`\n\nconcat ordering.\n\nDefault value: `'10'`\n\n##### `type`\n\nData type: `Optional[Enum['ipv4_addr', 'ipv6_addr', 'ether_addr', 'inet_proto', 'inet_service', 'mark']]`\n\ntype of set.\n\nDefault value: ``undef``\n\n##### `table`\n\nData type: `Variant[String, Array[String, 1]]`\n\ntable or array of tables to add the set to.\n\nDefault value: `'inet-filter'`\n\n##### `flags`\n\nData type: `Array[Enum['constant', 'dynamic', 'interval', 'timeout'], 0, 4]`\n\nspecify flags for set\n\nDefault value: `[]`\n\n##### `timeout`\n\nData type: `Optional[Integer]`\n\ntimeout in seconds\n\nDefault value: ``undef``\n\n##### `gc_interval`\n\nData type: `Optional[Integer]`\n\ngarbage collection interval.\n\nDefault value: ``undef``\n\n##### `elements`\n\nData type: `Optional[Array[String]]`\n\ninitialize the set with some elements in it.\n\nDefault value: ``undef``\n\n##### `size`\n\nData type: `Optional[Integer]`\n\nlimits the maximum number of elements of the set.\n\nDefault value: ``undef``\n\n##### `policy`\n\nData type: `Optional[Enum['performance', 'memory']]`\n\ndetermines set selection policy.\n\nDefault value: ``undef``\n\n##### `auto_merge`\n\nData type: `Boolean`\n\n?\n\nDefault value: ``false``\n\n##### `content`\n\nData type: `Optional[String]`\n\nspecify content of set.\n\nDefault value: ``undef``\n\n##### `source`\n\nData type: `Optional[Variant[String,Array[String,1]]]`\n\nspecify source of set.\n\nDefault value: ``undef``\n\n### `nftables::simplerule`\n\nProvides a simplified interface to nftables::rule\n\n#### Examples\n\n##### allow incoming traffic from port 541 on port 543 TCP to a given IP range and count packets\n\n```puppet\nnftables::simplerule{'my_service_in':\n action => 'accept',\n comment => 'allow traffic to port 543',\n counter => true,\n proto => 'tcp',\n dport => 543,\n daddr => '2001:1458::/32',\n sport => 541,\n}\n```\n\n#### Parameters\n\nThe following parameters are available in the `nftables::simplerule` defined type:\n\n* 
[`ensure`](#ensure)\n* [`rulename`](#rulename)\n* [`order`](#order)\n* [`chain`](#chain)\n* [`table`](#table)\n* [`action`](#action)\n* [`comment`](#comment)\n* [`dport`](#dport)\n* [`proto`](#proto)\n* [`daddr`](#daddr)\n* [`set_type`](#set_type)\n* [`sport`](#sport)\n* [`saddr`](#saddr)\n* [`counter`](#counter)\n\n##### `ensure`\n\nData type: `Enum['present','absent']`\n\nShould the rule be created.\n\nDefault value: `'present'`\n\n##### `rulename`\n\nData type: `Nftables::SimpleRuleName`\n\nThe symbolic name for the rule to add. Defaults to the resource's title.\n\nDefault value: `$title`\n\n##### `order`\n\nData type: `Pattern[/^\\d\\d$/]`\n\nA number representing the order of the rule.\n\nDefault value: `'50'`\n\n##### `chain`\n\nData type: `String`\n\nThe name of the chain to add this rule to.\n\nDefault value: `'default_in'`\n\n##### `table`\n\nData type: `String`\n\nThe name of the table to add this rule to.\n\nDefault value: `'inet-filter'`\n\n##### `action`\n\nData type: `Enum['accept', 'continue', 'drop', 'queue', 'return']`\n\nThe verdict for the matched traffic.\n\nDefault value: `'accept'`\n\n##### `comment`\n\nData type: `Optional[String]`\n\nA typically human-readable comment for the rule.\n\nDefault value: ``undef``\n\n##### `dport`\n\nData type: `Optional[Nftables::Port]`\n\nThe destination port, ports or port range.\n\nDefault value: ``undef``\n\n##### `proto`\n\nData type: `Optional[Enum['tcp', 'tcp4', 'tcp6', 'udp', 'udp4', 'udp6']]`\n\nThe transport-layer protocol to match.\n\nDefault value: ``undef``\n\n##### `daddr`\n\nData type: `Optional[Nftables::Addr]`\n\nThe destination address, CIDR or set to match.\n\nDefault value: ``undef``\n\n##### `set_type`\n\nData type: `Enum['ip', 'ip6']`\n\nWhen using sets as saddr or daddr, the type of the set.\nUse `ip` for sets of type `ipv4_addr`.\n\nDefault value: `'ip6'`\n\n##### `sport`\n\nData type: `Optional[Nftables::Port]`\n\nThe source port, ports or port range.\n\nDefault value: ``undef``\n\n##### `saddr`\n\nData type: `Optional[Nftables::Addr]`\n\nThe source address, CIDR or set to match.\n\nDefault value: ``undef``\n\n##### `counter`\n\nData type: `Boolean`\n\nEnable traffic counters for the matched traffic.\n\nDefault value: ``false``\n\n## Data types\n\n### `Nftables::Addr`\n\nRepresents an address expression to be used within a rule.\n\nAlias of\n\n```puppet\nVariant[Stdlib::IP::Address::V6, Stdlib::IP::Address::V4, Nftables::Addr::Set]\n```\n\n### `Nftables::Addr::Set`\n\nRepresents a set expression to be used within a rule.\n\nAlias of\n\n```puppet\nPattern[/^@[-a-zA-Z0-9_]+$/]\n```\n\n### `Nftables::Port`\n\nRepresents a port expression to be used within a rule.\n\nAlias of\n\n```puppet\nVariant[Array[Stdlib::Port, 1], Stdlib::Port, Nftables::Port::Range]\n```\n\n### `Nftables::Port::Range`\n\nRepresents a port range expression to be used within a rule.\n\nAlias of\n\n```puppet\nPattern[/^\\d+-\\d+$/]\n```\n\n### `Nftables::RuleName`\n\nRepresents a rule name to be used in a raw rule created via nftables::rule.\nIt's a dash separated string. 
The first component describes the chain to\nadd the rule to, the second the rule name and the (optional) third a number.\nEx: 'default_in-sshd', 'default_out-my_service-2'.\n\nAlias of\n\n```puppet\nPattern[/^[a-zA-Z0-9_]+-[a-zA-Z0-9_]+(-\\d+)?$/]\n```\n\n### `Nftables::SimpleRuleName`\n\nRepresents a simple rule name to be used in a rule created via nftables::simplerule\n\nAlias of\n\n```puppet\nPattern[/^[a-zA-Z0-9_]+(-\\d+)?$/]\n```\n\n", + "malware_scan": null, + "tasks": [], + "plans": [], + "created_at": "2022-10-25 10:43:50 -0700", + "updated_at": "2022-10-25 10:44:44 -0700", + "deleted_at": null, + "deleted_for": null + }, + "releases": [ + { + "uri": "/v3/releases/puppet-nftables-2.6.0", + "slug": "puppet-nftables-2.6.0", + "version": "2.6.0", + "supported": false, + "created_at": "2022-10-25 10:43:50 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.6.0.tar.gz", + "file_size": 36251 + }, + { + "uri": "/v3/releases/puppet-nftables-2.5.0", + "slug": "puppet-nftables-2.5.0", + "version": "2.5.0", + "supported": false, + "created_at": "2022-08-26 06:58:15 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.5.0.tar.gz", + "file_size": 35213 + }, + { + "uri": "/v3/releases/puppet-nftables-2.4.0", + "slug": "puppet-nftables-2.4.0", + "version": "2.4.0", + "supported": false, + "created_at": "2022-07-11 01:53:46 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.4.0.tar.gz", + "file_size": 35026 + }, + { + "uri": "/v3/releases/puppet-nftables-2.3.0", + "slug": "puppet-nftables-2.3.0", + "version": "2.3.0", + "supported": false, + "created_at": "2022-07-06 05:55:59 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.3.0.tar.gz", + "file_size": 34184 + }, + { + "uri": "/v3/releases/puppet-nftables-2.2.1", + "slug": "puppet-nftables-2.2.1", + "version": "2.2.1", + "supported": false, + "created_at": "2022-05-02 02:25:24 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.2.1.tar.gz", + "file_size": 33832 + }, + { + "uri": "/v3/releases/puppet-nftables-2.2.0", + "slug": "puppet-nftables-2.2.0", + "version": "2.2.0", + "supported": false, + "created_at": "2022-02-27 09:25:07 -0800", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.2.0.tar.gz", + "file_size": 33785 + }, + { + "uri": "/v3/releases/puppet-nftables-2.1.0", + "slug": "puppet-nftables-2.1.0", + "version": "2.1.0", + "supported": false, + "created_at": "2021-09-14 03:59:27 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.1.0.tar.gz", + "file_size": 33604 + }, + { + "uri": "/v3/releases/puppet-nftables-2.0.0", + "slug": "puppet-nftables-2.0.0", + "version": "2.0.0", + "supported": false, + "created_at": "2021-06-03 06:21:53 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-2.0.0.tar.gz", + "file_size": 33204 + }, + { + "uri": "/v3/releases/puppet-nftables-1.3.0", + "slug": "puppet-nftables-1.3.0", + "version": "1.3.0", + "supported": false, + "created_at": "2021-03-25 05:52:02 -0700", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-1.3.0.tar.gz", + "file_size": 39256 + }, + { + "uri": "/v3/releases/puppet-nftables-1.2.0", + "slug": "puppet-nftables-1.2.0", + "version": "1.2.0", + "supported": false, + "created_at": "2021-03-03 00:41:48 -0800", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-1.2.0.tar.gz", + "file_size": 35637 + }, + { + "uri": "/v3/releases/puppet-nftables-1.1.1", + "slug": "puppet-nftables-1.1.1", + "version": "1.1.1", + 
"supported": false, + "created_at": "2021-01-29 00:20:01 -0800", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-1.1.1.tar.gz", + "file_size": 35456 + }, + { + "uri": "/v3/releases/puppet-nftables-1.1.0", + "slug": "puppet-nftables-1.1.0", + "version": "1.1.0", + "supported": false, + "created_at": "2021-01-25 07:20:46 -0800", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-1.1.0.tar.gz", + "file_size": 34798 + }, + { + "uri": "/v3/releases/puppet-nftables-1.0.0", + "slug": "puppet-nftables-1.0.0", + "version": "1.0.0", + "supported": false, + "created_at": "2020-12-15 02:15:22 -0800", + "deleted_at": null, + "file_uri": "/v3/files/puppet-nftables-1.0.0.tar.gz", + "file_size": 31530 + } + ], + "feedback_score": null, + "homepage_url": "https://github.com/voxpupuli/puppet-nftables", + "issues_url": "https://github.com/voxpupuli/puppet-nftables/issues" + } + ] +} diff --git a/swh/lister/puppet/tests/test_lister.py b/swh/lister/puppet/tests/test_lister.py index 80e5a63..c3d1cce 100644 --- a/swh/lister/puppet/tests/test_lister.py +++ b/swh/lister/puppet/tests/test_lister.py @@ -1,120 +1,164 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime, timedelta, timezone + from swh.lister.puppet.lister import PuppetLister # flake8: noqa: B950 expected_origins = [ { "url": "https://forge.puppet.com/modules/electrical/file_concat", "artifacts": [ { "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.1.tar.gz", "version": "1.0.1", "filename": "electrical-file_concat-1.0.1.tar.gz", "last_update": "2015-04-17T01:03:46-07:00", "checksums": { "md5": "74901a89544134478c2dfde5efbb7f14", "sha256": "15e973613ea038d8a4f60bafe2d678f88f53f3624c02df3157c0043f4a400de6", }, }, { "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.0.tar.gz", "version": "1.0.0", "filename": "electrical-file_concat-1.0.0.tar.gz", "last_update": "2015-04-09T12:03:13-07:00", "checksums": { "length": 13289, }, }, ], }, { "url": "https://forge.puppet.com/modules/puppetlabs/puppetdb", "artifacts": [ { "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-7.10.0.tar.gz", "version": "7.10.0", "filename": "puppetlabs-puppetdb-7.10.0.tar.gz", "last_update": "2021-12-16T14:57:46-08:00", "checksums": { "md5": "e91a2074ca8d94a8b3ff7f6c8bbf12bc", "sha256": "49b1a542fbd2a1378c16cb04809e0f88bf4f3e45979532294fb1f03f56c97fbb", }, }, { "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-7.9.0.tar.gz", "version": "7.9.0", "filename": "puppetlabs-puppetdb-7.9.0.tar.gz", "last_update": "2021-06-24T07:48:54-07:00", "checksums": { "length": 42773, }, }, { "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-1.0.0.tar.gz", "version": "1.0.0", "filename": "puppetlabs-puppetdb-1.0.0.tar.gz", "last_update": "2012-09-19T16:51:22-07:00", "checksums": { "length": 16336, }, }, ], }, { "url": "https://forge.puppet.com/modules/saz/memcached", "artifacts": [ { "url": "https://forgeapi.puppet.com/v3/files/saz-memcached-8.1.0.tar.gz", "version": "8.1.0", "filename": "saz-memcached-8.1.0.tar.gz", "last_update": "2022-07-11T03:34:55-07:00", "checksums": { "md5": "aadf80fba5848909429eb002ee1927ea", "sha256": "883d6186e91c2c3fed13ae2009c3aa596657f6707b76f1f7efc6203c6e4ae986", }, }, { "url": 
"https://forgeapi.puppet.com/v3/files/saz-memcached-1.0.0.tar.gz", "version": "1.0.0", "filename": "saz-memcached-1.0.0.tar.gz", "last_update": "2011-11-20T13:40:30-08:00", "checksums": { "length": 2472, }, }, ], }, ] def test_puppet_lister(datadir, requests_mock_datadir, swh_scheduler): lister = PuppetLister(scheduler=swh_scheduler) res = lister.run() assert res.pages == 2 assert res.origins == 1 + 1 + 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(expected_origins) assert [ ( scheduled.visit_type, scheduled.url, scheduled.extra_loader_arguments["artifacts"], ) for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) ] == [ ( "puppet", expected["url"], expected["artifacts"], ) for expected in sorted(expected_origins, key=lambda expected: expected["url"]) ] + + +def test_puppet_lister_incremental(datadir, requests_mock_datadir, swh_scheduler): + + # First run + lister = PuppetLister(scheduler=swh_scheduler) + res = lister.run() + + assert res.pages == 2 + assert res.origins == 1 + 1 + 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(expected_origins) + + assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments["artifacts"], + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "puppet", + expected["url"], + expected["artifacts"], + ) + for expected in sorted(expected_origins, key=lambda expected: expected["url"]) + ] + + # Second run + lister = PuppetLister(scheduler=swh_scheduler) + # Force lister.state.last_listing_date for correct fixture loading + + lister.state.last_listing_date = datetime(2022, 9, 26, 18, 0).astimezone( + timezone(timedelta(hours=-7)) + ) + res = lister.run() + + assert res.pages == 1 + assert res.origins == 1 diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 443c21d..64f14fa 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,177 +1,183 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging from time import sleep from typing import Any, Dict, Iterator, List, Optional, Tuple from xmlrpc.client import Fault, ServerProxy from tenacity.before_sleep import before_sleep_log from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Type returned by the XML-RPC changelog call: # package, version, release timestamp, description, serial ChangelogEntry = Tuple[str, str, int, str, int] # Manipulated package updated type which is a subset information # of the ChangelogEntry type: package, max release date PackageUpdate = Tuple[str, datetime] # Type returned by listing a page of results PackageListPage = List[PackageUpdate] @dataclass class PyPIListerState: """State of PyPI lister""" last_serial: Optional[int] = None """Last seen serial when visiting the pypi instance""" def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate to handle xmlrpc client error: .. 
code:: xmlrpc.client.Fault: """ attempt = retry_state.outcome return attempt.failed and isinstance(attempt.exception(), Fault) def pypi_url(package_name: str) -> str: """Build pypi url out of a package name.""" return PyPILister.PACKAGE_URL.format(package_name=package_name) class PyPILister(Lister[PyPIListerState, PackageListPage]): """List origins from PyPI.""" LISTER_NAME = "pypi" INSTANCE = "pypi" # As of today only the main pypi.org is used PACKAGE_LIST_URL = "https://pypi.org/pypi" # XML-RPC url PACKAGE_URL = "https://pypi.org/project/{package_name}/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.PACKAGE_LIST_URL, instance=self.INSTANCE, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # used as termination condition and if useful, becomes the new state when the # visit is done self.last_processed_serial: Optional[int] = None def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState: return PyPIListerState(last_serial=d.get("last_serial")) def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]: return asdict(state) @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_last_serial(self, client: ServerProxy) -> int: """Internal detail to allow throttling when calling the changelog last entry""" serial = client.changelog_last_serial() assert isinstance(serial, int) return serial @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_since_serial( self, client: ServerProxy, serial: int ) -> List[ChangelogEntry]: """Internal detail to allow throttling when calling the changelog listing""" sleep(1) # to avoid the initial warning about throttling return client.changelog_since_serial(serial) # type: ignore def get_pages(self) -> Iterator[PackageListPage]: """Iterate other changelog events per package, determine the max release date for that package and use that max release date as last_update. When the execution is done, this will also set the self.last_processed_serial attribute so we can finalize the state of the lister for the next visit. 
Yields: List of Tuple of (package-name, max release-date) """ client = ServerProxy(self.url) last_processed_serial = -1 if self.state.last_serial is not None: last_processed_serial = self.state.last_serial upstream_last_serial = self._changelog_last_serial(client) # Paginate through result of pypi, until we read everything while last_processed_serial < upstream_last_serial: updated_packages = defaultdict(list) for package, _, release_date, _, serial in self._changelog_since_serial( client, last_processed_serial ): updated_packages[package].append(release_date) # Compute the max serial so we can stop when done last_processed_serial = max(last_processed_serial, serial) # Returns pages of result to flush regularly yield [ ( pypi_url(package), datetime.fromtimestamp(max(release_dates)).replace( tzinfo=timezone.utc ), ) for package, release_dates in updated_packages.items() ] self.last_processed_serial = upstream_last_serial def get_origins_from_page( self, packages: PackageListPage ) -> Iterator[ListedOrigin]: """Convert a page of PyPI repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for origin, last_update in packages: yield ListedOrigin( lister_id=self.lister_obj.id, url=origin, visit_type="pypi", last_update=last_update, ) def finalize(self): """Finalize the visit state by updating with the new last_serial if updates actually happened. """ self.updated = ( self.state and self.state.last_serial and self.last_processed_serial and self.state.last_serial < self.last_processed_serial ) or (not self.state.last_serial and self.last_processed_serial) if self.updated: self.state.last_serial = self.last_processed_serial diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index 6898b8b..bb317ea 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -1,214 +1,236 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from datetime import timezone import gzip import logging import os import shutil import subprocess import tarfile import tempfile from typing import Any, Dict, Iterator, Optional, Tuple from bs4 import BeautifulSoup import psycopg2 from testing.postgresql import Postgresql from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) RubyGemsListerPage = Dict[str, Any] class RubyGemsLister(StatelessLister[RubyGemsListerPage]): """Lister for RubyGems.org, the Ruby community's gem hosting service. Instead of querying rubygems.org Web API, it uses gems data from the daily PostreSQL database dump of rubygems. It enables to gather all interesting info about a gem and its release artifacts (version number, download URL, checksums, release date) in an efficient way and without flooding rubygems Web API with numerous HTTP requests (as there is more than 187000 gems available on 07/10/2022). 
""" LISTER_NAME = "rubygems" VISIT_TYPE = "rubygems" INSTANCE = "rubygems" RUBY_GEMS_POSTGRES_DUMP_BASE_URL = ( "https://s3-us-west-2.amazonaws.com/rubygems-dumps" ) RUBY_GEMS_POSTGRES_DUMP_LIST_URL = ( f"{RUBY_GEMS_POSTGRES_DUMP_BASE_URL}?prefix=production/public_postgresql" ) RUBY_GEM_DOWNLOAD_URL_PATTERN = "https://rubygems.org/downloads/{gem}-{version}.gem" RUBY_GEM_ORIGIN_URL_PATTERN = "https://rubygems.org/gems/{gem}" RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN = ( "https://rubygems.org/api/v2/rubygems/{gem}/versions/{version}.json" ) DB_NAME = "rubygems" DUMP_SQL_PATH = "public_postgresql/databases/PostgreSQL.sql.gz" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_latest_dump_file(self) -> str: response = self.http_request(self.RUBY_GEMS_POSTGRES_DUMP_LIST_URL) xml = BeautifulSoup(response.content, "xml") contents = xml.find_all("Contents") return contents[-1].find("Key").text def create_rubygems_db( self, postgresql: Postgresql ) -> Tuple[str, psycopg2._psycopg.connection]: logger.debug("Creating rubygems database") db_dsn = postgresql.dsn() db_url = postgresql.url().replace(db_dsn["database"], self.DB_NAME) db = psycopg2.connect(**db_dsn) db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) with db.cursor() as cursor: cursor.execute(f"CREATE DATABASE {self.DB_NAME}") db_dsn["database"] = self.DB_NAME db = psycopg2.connect(**db_dsn) db.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) with db.cursor() as cursor: cursor.execute("CREATE EXTENSION IF NOT EXISTS hstore") return db_url, db def populate_rubygems_db(self, db_url: str): dump_file = self.get_latest_dump_file() dump_id = dump_file.split("/")[2] response = self.http_request(f"{self.url}/{dump_file}", stream=True) with tempfile.TemporaryDirectory() as temp_dir: logger.debug( "Downloading latest rubygems database dump: %s (%s bytes)", dump_id, response.headers["content-length"], ) dump_file = os.path.join(temp_dir, "rubygems_dump.tar") with open(dump_file, "wb") as dump: for chunk in response.iter_content(chunk_size=1024): dump.write(chunk) with tarfile.open(dump_file) as dump_tar: dump_tar.extractall(temp_dir) logger.debug("Populating rubygems database with dump %s", dump_id) + + # FIXME: make this work with -v ON_ERROR_STOP=1 psql = subprocess.Popen( - ["psql", "-q", db_url], + ["psql", "--no-psqlrc", "-q", db_url], stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) # passing value of gzip.open as stdin of subprocess.run makes the process # read raw data instead of decompressed data so we have to use a pipe with gzip.open(os.path.join(temp_dir, self.DUMP_SQL_PATH), "rb") as sql: shutil.copyfileobj(sql, psql.stdin) # type: ignore # denote end of read file psql.stdin.close() # type: ignore psql.wait() + if psql.returncode != 0: + assert psql.stdout + for line in psql.stdout.readlines(): + logger.warning("psql out: %s", line.decode().strip()) + assert psql.stderr + for line in psql.stderr.readlines(): + logger.warning("psql err: %s", line.decode().strip()) + raise ValueError( + "Loading rubygems dump failed with exit code %s.", + psql.returncode, + ) + def get_pages(self) 
-> Iterator[RubyGemsListerPage]: # spawn a temporary postgres instance (require initdb executable in environment) with Postgresql() as postgresql: db_url, db = self.create_rubygems_db(postgresql) self.populate_rubygems_db(db_url) with db.cursor() as cursor: cursor.execute("SELECT id, name from rubygems") for gem_id, gem_name in cursor.fetchall(): logger.debug("Processing gem named %s", gem_name) with db.cursor() as cursor_v: cursor_v.execute( "SELECT authors, built_at, number, sha256, size from versions " "where rubygem_id = %s", (gem_id,), ) versions = [ { "number": number, "url": self.RUBY_GEM_DOWNLOAD_URL_PATTERN.format( gem=gem_name, version=number ), "date": built_at.replace(tzinfo=timezone.utc), "authors": authors, "sha256": ( base64.decodebytes(sha256.encode()).hex() if sha256 else None ), "size": size, } for authors, built_at, number, sha256, size in cursor_v.fetchall() ] if versions: yield { "name": gem_name, "versions": versions, } def get_origins_from_page(self, page: RubyGemsListerPage) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None artifacts = [] rubygem_metadata = [] for version in page["versions"]: artifacts.append( { "version": version["number"], "filename": version["url"].split("/")[-1], "url": version["url"], "checksums": ( {"sha256": version["sha256"]} if version["sha256"] else {} ), "length": version["size"], } ) rubygem_metadata.append( { "version": version["number"], "date": version["date"].isoformat(), "authors": version["authors"], "extrinsic_metadata_url": ( self.RUBY_GEM_EXTRINSIC_METADATA_URL_PATTERN.format( gem=page["name"], version=version["number"] ) ), } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=self.RUBY_GEM_ORIGIN_URL_PATTERN.format(gem=page["name"]), last_update=max(version["date"] for version in page["versions"]), extra_loader_arguments={ "artifacts": artifacts, "rubygem_metadata": rubygem_metadata, }, ) diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index ba8c412..234e198 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,430 +1,436 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass, field import datetime from enum import Enum import logging import re from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from xml.etree import ElementTree from bs4 import BeautifulSoup import iso8601 import lxml import requests from swh.core.api.classes import stream_results from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) class VcsNames(Enum): """Used to filter SourceForge tool names for valid VCS types""" # CVS projects are read-only CVS = "cvs" GIT = "git" SUBVERSION = "svn" MERCURIAL = "hg" BAZAAR = "bzr" VCS_NAMES = set(v.value for v in VcsNames.__members__.values()) @dataclass class SourceForgeListerEntry: vcs: VcsNames url: str last_modified: datetime.date SubSitemapNameT = str ProjectNameT = str # SourceForge only offers day-level granularity, which is good enough for our purposes LastModifiedT = datetime.date @dataclass class SourceForgeListerState: """Current state of the SourceForge lister in incremental runs""" """If the subsitemap does not exist, we 
assume a full run of this subsitemap is needed. If the date is the same, we skip the subsitemap, otherwise we request the subsitemap and look up every project's "last modified" date to compare against `ListedOrigins` from the database.""" subsitemap_last_modified: Dict[SubSitemapNameT, LastModifiedT] = field( default_factory=dict ) """Some projects (not the majority, but still meaningful) have no VCS for us to archive. We need to remember a mapping of their API URL to their "last modified" date so we don't keep querying them needlessly every time.""" empty_projects: Dict[str, LastModifiedT] = field(default_factory=dict) SourceForgeListerPage = List[SourceForgeListerEntry] MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" # API resource endpoint for information about the given project. # # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe` # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" # Predictable URL for cloning (in the broad sense) a VCS registered for the project. # # Warning: does not apply to bzr repos, and Mercurial are http only, see use of this # constant below. # # `vcs`: VCS type, one of `VCS_NAMES` # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe`. # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses # `git` (https://git.code.sf.net/p/codeblocks/git). CLONE_URL_FORMAT = "https://{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" PROJ_URL_RE = re.compile( r"^https://sourceforge.net/(?P[^/]+)/(?P[^/]+)/(?P.*)?" ) # Mapping of `(namespace, project name)` to `last modified` date. ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge.""" # Part of the lister API, that identifies this lister LISTER_NAME = "sourceforge" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Will hold the currently saved "last modified" dates to compare against our # requests. 
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None self.session.headers.update({"Accept": "application/json"}) self.incremental = incremental def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: subsitemaps = { k: datetime.date.fromisoformat(v) for k, v in d.get("subsitemap_last_modified", {}).items() } empty_projects = { k: datetime.date.fromisoformat(v) for k, v in d.get("empty_projects", {}).items() } return SourceForgeListerState( subsitemap_last_modified=subsitemaps, empty_projects=empty_projects ) def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]: return { "subsitemap_last_modified": { k: v.isoformat() for k, v in state.subsitemap_last_modified.items() }, "empty_projects": { k: v.isoformat() for k, v in state.empty_projects.items() }, } def projects_last_modified(self) -> ProjectsLastModifiedCache: if not self.incremental: # No point in loading the previous results if we're doing a full run return {} if self._project_last_modified is not None: return self._project_last_modified # We know there will be at least that many origins stream = stream_results( self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000 ) listed_origins = dict() # Projects can have slashes in them if they're subprojects, but the # mointpoint (last component) cannot. url_match = re.compile( r".*\.code\.sf\.net/(?P[^/]+)/(?P.+)/.*" ) bzr_url_match = re.compile( r"http://(?P[^/]+).bzr.sourceforge.net/bzr/([^/]+)" ) cvs_url_match = re.compile( r"rsync://a.cvs.sourceforge.net/cvsroot/(?P.+)/([^/]+)" ) for origin in stream: url = origin.url match = url_match.match(url) if match is None: # Could be a bzr or cvs special endpoint bzr_match = bzr_url_match.match(url) cvs_match = cvs_url_match.match(url) matches = None if bzr_match is not None: matches = bzr_match.groupdict() elif cvs_match is not None: matches = cvs_match.groupdict() assert matches project = matches["project"] namespace = "p" # no special namespacing for bzr and cvs projects else: matches = match.groupdict() namespace = matches["namespace"] project = matches["project"] # "Last modified" dates are the same across all VCS (tools, even) # within a project or subproject. An assertion here would be overkill. last_modified = origin.last_update assert last_modified is not None listed_origins[(namespace, project)] = last_modified.date() self._project_last_modified = listed_origins return listed_origins def get_pages(self) -> Iterator[SourceForgeListerPage]: """ SourceForge has a main XML sitemap that lists its sharded sitemaps for all projects. Each XML sub-sitemap lists project pages, which are not unique per project: a project can have a wiki, a home, a git, an svn, etc. For each unique project, we query an API endpoint that lists (among other things) the tools associated with said project, some of which are the VCS used. Subprojects are considered separate projects. Lastly we use the information of which VCS are used to build the predictable clone URL for any given VCS. 
""" sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text tree = ElementTree.fromstring(sitemap_contents) for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_el is not None and last_modified_el.text is not None last_modified = datetime.date.fromisoformat(last_modified_el.text) location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None and location.text is not None sub_url = location.text if self.incremental: recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) if recorded_last_mod == last_modified: # The entire subsitemap hasn't changed, so none of its projects # have either, skip it. continue self.state.subsitemap_last_modified[sub_url] = last_modified subsitemap_contents = self.http_request(sub_url).text subtree = ElementTree.fromstring(subsitemap_contents) yield from self._get_pages_from_subsitemap(subtree) def get_origins_from_page( self, page: SourceForgeListerPage ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: last_modified: str = str(hit.last_modified) last_update: datetime.datetime = iso8601.parse_date(last_modified) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, last_update=last_update, ) def _get_pages_from_subsitemap( self, subtree: ElementTree.Element ) -> Iterator[SourceForgeListerPage]: projects: Set[ProjectNameT] = set() for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_block is not None last_modified = last_modified_block.text location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None project_url = location.text assert project_url is not None match = PROJ_URL_RE.match(project_url) if match: matches = match.groupdict() namespace = matches["namespace"] if namespace == "projects": # These have a `p`-namespaced counterpart, use that instead continue project = matches["project"] rest = matches["rest"] if rest.count("/") > 1: # This is a subproject. There exists no sub-subprojects. subproject_name = rest.rsplit("/", 2)[0] project = f"{project}/{subproject_name}" prev_len = len(projects) projects.add(project) if prev_len == len(projects): # Already seen continue pages = self._get_pages_for_project(namespace, project, last_modified) if pages: yield pages else: logger.debug("Project '%s' does not have any VCS", project) else: # Should almost always match, let's log it # The only ones that don't match are mostly specialized one-off URLs. msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) def _get_pages_for_project( self, namespace, project, last_modified ) -> SourceForgeListerPage: endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project) empty_project_last_modified = self.state.empty_projects.get(endpoint) if empty_project_last_modified is not None: if last_modified == empty_project_last_modified.isoformat(): # Project has not changed, so is still empty, meaning it has # no VCS attached that we can archive. 
logger.debug(f"Project {namespace}/{project} is still empty") return [] if self.incremental: expected = self.projects_last_modified().get((namespace, project)) if expected is not None: if expected.isoformat() == last_modified: # Project has not changed logger.debug(f"Project {namespace}/{project} has not changed") return [] else: logger.debug(f"Project {namespace}/{project} was updated") else: msg = "New project during an incremental run: %s/%s" logger.debug(msg, namespace, project) try: res = self.http_request(endpoint).json() except requests.HTTPError: # We've already logged in `http_request` return [] tools = res.get("tools") if tools is None: # This rarely happens, on very old URLs logger.warning("Project '%s' does not have any tools", endpoint) return [] hits = [] for tool in tools: tool_name = tool["name"] if tool_name not in VCS_NAMES: continue if tool_name == VcsNames.CVS.value: # CVS projects are different from other VCS ones, they use the rsync # protocol, a list of modules needs to be fetched from an info page # and multiple origin URLs can be produced for a same project. cvs_info_url = f"http://{project}.cvs.sourceforge.net" try: response = self.http_request(cvs_info_url) except requests.HTTPError: logger.warning( "CVS info page could not be fetched, skipping project '%s'", project, ) continue else: bs = BeautifulSoup(response.text, features="html.parser") cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot" for text in [b.text for b in bs.find_all("b")]: match = re.search(rf".*/cvsroot/{project} co -P (.+)", text) if match is not None: module = match.group(1) if module != "Attic": url = f"{cvs_base_url}/{project}/{module}" hits.append( SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified, ) ) continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, project=project, mount_point=tool["mount_point"], ) if tool_name == VcsNames.MERCURIAL.value: # SourceForge does not yet support anonymous HTTPS cloning for Mercurial # See https://sourceforge.net/p/forge/feature-requests/727/ url = url.replace("https://", "http://") if tool_name == VcsNames.BAZAAR.value: # SourceForge has removed support for bzr and only keeps legacy projects # around at a separate (also not https) URL. Bzr projects are very rare # and a lot of them are 404 now. 
url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" try: response = self.http_request(url) if "To get this branch, use:" not in response.text: # If a bzr project has multiple branches, we need to extract their # names from the repository landing page and create one listed origin # per branch parser = lxml.etree.HTMLParser() tree = lxml.etree.fromstring(response.text, parser) # Get all tds with class 'autcell' tds = tree.xpath(".//td[contains(@class, 'autcell')]") for td in tds: branch = td.findtext("a") # If the td's parent contains Branch and # it has non-empty text: if td.xpath("..//img[@alt='Branch']") and branch: hits.append( SourceForgeListerEntry( vcs=VcsNames(tool_name), url=f"{url}/{branch}", last_modified=last_modified, ) ) continue except requests.HTTPError: logger.warning( "Bazaar repository page could not be fetched, skipping project '%s'", project, ) continue entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) hits.append(entry) if not hits: date = datetime.date.fromisoformat(last_modified) self.state.empty_projects[endpoint] = date else: self.state.empty_projects.pop(endpoint, None) return hits diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index a69ec1c..a3f640b 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -1,63 +1,66 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.lister.cli import SUPPORTED_LISTERS, get_lister lister_args = { "cgit": { "url": "https://git.eclipse.org/c/", }, "phabricator": { "instance": "softwareheritage", "url": "https://forge.softwareheritage.org/api/diffusion.repository.search", "api_token": "bogus", }, "gitea": { "url": "https://try.gitea.io/api/v1/", }, "tuleap": { "url": "https://tuleap.net", }, "gitlab": { "url": "https://gitlab.ow2.org/api/v4", "instance": "ow2", }, "opam": {"url": "https://opam.ocaml.org", "instance": "opam"}, "maven": { "url": "https://repo1.maven.org/maven2/", "index_url": "http://indexes/export.fld", }, "gogs": { "url": "https://try.gogs.io/", "api_token": "secret", }, "nixguix": { "url": "https://guix.gnu.org/sources.json", "origin_upstream": "https://git.savannah.gnu.org/cgit/guix.git/", }, + "fedora": { + "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases//", + }, } def test_get_lister_wrong_input(): """Unsupported lister should raise""" with pytest.raises(ValueError) as e: get_lister("unknown", "db-url") assert "Invalid lister" in str(e.value) def test_get_lister(swh_scheduler_config): """Instantiating a supported lister should be ok""" # Drop launchpad lister from the lister to check, its test setup is more involved # than the other listers and it's not currently done here for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name, scheduler={"cls": "local", **swh_scheduler_config}, **lister_args.get(lister_name, {}), ) assert hasattr(lst, "run") diff --git a/swh/lister/tests/test_pattern.py b/swh/lister/tests/test_pattern.py index 554a8d1..6dcd1d5 100644 --- a/swh/lister/tests/test_pattern.py +++ b/swh/lister/tests/test_pattern.py @@ -1,217 +1,318 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See 
top-level LICENSE file for more information from typing import TYPE_CHECKING, Any, Dict, Iterator, List import pytest from swh.lister import pattern from swh.scheduler.model import ListedOrigin StateType = Dict[str, str] OriginType = Dict[str, str] PageType = List[OriginType] class InstantiableLister(pattern.Lister[StateType, PageType]): """A lister that can only be instantiated, not run.""" LISTER_NAME = "test-pattern-lister" def state_from_dict(self, d: Dict[str, str]) -> StateType: return d def test_instantiation(swh_scheduler): lister = InstantiableLister( scheduler=swh_scheduler, url="https://example.com", instance="example.com" ) # check the lister was registered in the scheduler backend stored_lister = swh_scheduler.get_or_create_lister( name="test-pattern-lister", instance_name="example.com" ) assert stored_lister == lister.lister_obj with pytest.raises(NotImplementedError): lister.run() def test_lister_instance_name(swh_scheduler): lister = InstantiableLister( scheduler=swh_scheduler, url="https://example.org", instance="example" ) assert lister.instance == "example" lister = InstantiableLister(scheduler=swh_scheduler, url="https://example.org") assert lister.instance == "example.org" def test_instantiation_from_configfile(swh_scheduler, mocker): mock_load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") mock_get_scheduler = mocker.patch("swh.lister.pattern.get_scheduler") mock_load_from_envvar.return_value = { "scheduler": {}, "url": "foo", "instance": "bar", } mock_get_scheduler.return_value = swh_scheduler lister = InstantiableLister.from_configfile() assert lister.url == "foo" assert lister.instance == "bar" lister = InstantiableLister.from_configfile(url="bar", instance="foo") assert lister.url == "bar" assert lister.instance == "foo" lister = InstantiableLister.from_configfile(url=None, instance="foo") assert lister.url == "foo" assert lister.instance == "foo" if TYPE_CHECKING: _Base = pattern.Lister[Any, PageType] else: _Base = object class ListerMixin(_Base): def get_pages(self) -> Iterator[PageType]: for pageno in range(2): yield [ {"url": f"https://example.com/{pageno:02d}{i:03d}"} for i in range(10) ] def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for origin in page: yield ListedOrigin( lister_id=self.lister_obj.id, url=origin["url"], visit_type="git" ) def check_listed_origins(swh_scheduler, lister, stored_lister): """Check that the listed origins match the ones in the lister""" # Gather the origins that are supposed to be listed lister_urls = sorted( sum([[o["url"] for o in page] for page in lister.get_pages()], []) ) # And check the state of origins in the scheduler ret = swh_scheduler.get_listed_origins() assert ret.next_page_token is None assert len(ret.results) == len(lister_urls) for origin, expected_url in zip(ret.results, lister_urls): assert origin.url == expected_url assert origin.lister_id == stored_lister.id class RunnableLister(ListerMixin, InstantiableLister): """A lister that can be run.""" def state_to_dict(self, state: StateType) -> Dict[str, str]: return state def finalize(self) -> None: self.state["updated"] = "yes" self.updated = True def test_run(swh_scheduler): lister = RunnableLister( scheduler=swh_scheduler, url="https://example.com", instance="example.com" ) assert "updated" not in lister.state update_date = lister.lister_obj.updated run_result = lister.run() assert run_result.pages == 2 assert run_result.origins == 20 stored_lister = 
swh_scheduler.get_or_create_lister( name="test-pattern-lister", instance_name="example.com" ) # Check that the finalize operation happened assert stored_lister.updated > update_date assert stored_lister.current_state["updated"] == "yes" check_listed_origins(swh_scheduler, lister, stored_lister) class InstantiableStatelessLister(pattern.StatelessLister[PageType]): LISTER_NAME = "test-stateless-lister" def test_stateless_instantiation(swh_scheduler): lister = InstantiableStatelessLister( scheduler=swh_scheduler, url="https://example.com", instance="example.com", ) # check the lister was registered in the scheduler backend stored_lister = swh_scheduler.get_or_create_lister( name="test-stateless-lister", instance_name="example.com" ) assert stored_lister == lister.lister_obj assert stored_lister.current_state == {} assert lister.state is None with pytest.raises(NotImplementedError): lister.run() class RunnableStatelessLister(ListerMixin, InstantiableStatelessLister): def finalize(self): self.updated = True def test_stateless_run(swh_scheduler): lister = RunnableStatelessLister( scheduler=swh_scheduler, url="https://example.com", instance="example.com" ) update_date = lister.lister_obj.updated run_result = lister.run() assert run_result.pages == 2 assert run_result.origins == 20 stored_lister = swh_scheduler.get_or_create_lister( name="test-stateless-lister", instance_name="example.com" ) # Check that the finalize operation happened assert stored_lister.updated > update_date assert stored_lister.current_state == {} # And that all origins are stored check_listed_origins(swh_scheduler, lister, stored_lister) class ListerWithSameOriginInMultiplePages(RunnableStatelessLister): def get_pages(self) -> Iterator[PageType]: for _ in range(2): yield [{"url": "https://example.org/user/project"}] def test_listed_origins_count(swh_scheduler): lister = ListerWithSameOriginInMultiplePages( scheduler=swh_scheduler, url="https://example.org", instance="example.org" ) run_result = lister.run() assert run_result.pages == 2 assert run_result.origins == 1 + + +class ListerWithALotOfPagesWithALotOfOrigins(RunnableStatelessLister): + def get_pages(self) -> Iterator[PageType]: + for page in range(10): + yield [ + {"url": f"https://example.org/page{page}/origin{origin}"} + for origin in range(10) + ] + + +@pytest.mark.parametrize( + "max_pages,expected_pages", + [ + (2, 2), + (10, 10), + (100, 10), + # The default returns all 10 pages + (None, 10), + ], +) +def test_lister_max_pages(swh_scheduler, max_pages, expected_pages): + extra_kwargs = {} + if max_pages is not None: + extra_kwargs["max_pages"] = max_pages + + lister = ListerWithALotOfPagesWithALotOfOrigins( + scheduler=swh_scheduler, + url="https://example.org", + instance="example.org", + **extra_kwargs, + ) + + run_result = lister.run() + + assert run_result.pages == expected_pages + assert run_result.origins == expected_pages * 10 + + +@pytest.mark.parametrize( + "max_origins_per_page,expected_origins_per_page", + [ + (2, 2), + (10, 10), + (100, 10), + # The default returns all 10 origins per page + (None, 10), + ], +) +def test_lister_max_origins_per_page( + swh_scheduler, max_origins_per_page, expected_origins_per_page +): + extra_kwargs = {} + if max_origins_per_page is not None: + extra_kwargs["max_origins_per_page"] = max_origins_per_page + + lister = ListerWithALotOfPagesWithALotOfOrigins( + scheduler=swh_scheduler, + url="https://example.org", + instance="example.org", + **extra_kwargs, + ) + + run_result = lister.run() + + assert run_result.pages == 10 
+ assert run_result.origins == 10 * expected_origins_per_page + + +@pytest.mark.parametrize( + "enable_origins,expected", + [ + (True, True), + (False, False), + # default behavior is to enable all listed origins + (None, True), + ], +) +def test_lister_enable_origins(swh_scheduler, enable_origins, expected): + extra_kwargs = {} + if enable_origins is not None: + extra_kwargs["enable_origins"] = enable_origins + + lister = ListerWithALotOfPagesWithALotOfOrigins( + scheduler=swh_scheduler, + url="https://example.org", + instance="example.org", + **extra_kwargs, + ) + + run_result = lister.run() + assert run_result.pages == 10 + assert run_result.origins == 100 + + origins = swh_scheduler.get_listed_origins( + lister_id=lister.lister_obj.id, enabled=None + ).results + + assert origins + + assert all(origin.enabled == expected for origin in origins) diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py index 4a55499..ce5cadf 100644 --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -1,123 +1,129 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] class TuleapLister(StatelessLister[RepoPage]): """List origins from Tuleap. Tuleap provides SVN and Git repositories hosting. Tuleap API getting started: https://tuleap.net/doc/en/user-guide/integration/rest.html Tuleap API reference: https://tuleap.net/api/explorer/ Using the API we first request a list of projects, and from there request their associated repositories individually. 
Everything is paginated, code uses throttling at the individual GET call level.""" LISTER_NAME = "tuleap" REPO_LIST_PATH = "/api" REPO_GIT_PATH = "plugins/git/" REPO_SVN_PATH = "plugins/svn/" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) @classmethod def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage: if repo_type == "git": prefix_url = TuleapLister.REPO_GIT_PATH else: prefix_url = TuleapLister.REPO_SVN_PATH rep = { "project": repo["name"], "type": repo_type, "uri": urljoin(url, f"{prefix_url}{repo['path']}"), "last_update_date": repo["last_update_date"], } return rep def _get_repositories(self, url_repo) -> List[Dict[str, Any]]: ret = self.http_request(url_repo) reps_list = ret.json()["repositories"] limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"]) offset = int(ret.headers["X-PAGINATION-LIMIT"]) size = int(ret.headers["X-PAGINATION-SIZE"]) while offset < size: url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit) ret = self.http_request(url_offset).json() reps_list = reps_list + ret["repositories"] offset += limit return reps_list def get_pages(self) -> Iterator[RepoPage]: # base with trailing slash, path without leading slash for urljoin url_api: str = urljoin(self.url, self.REPO_LIST_PATH) url_projects = url_api + "/projects/" # Get the list of projects. response = self.http_request(url_projects) projects_list = response.json() limit = int(response.headers["X-PAGINATION-LIMIT-MAX"]) offset = int(response.headers["X-PAGINATION-LIMIT"]) size = int(response.headers["X-PAGINATION-SIZE"]) while offset < size: url_offset = ( url_projects + "?offset=" + str(offset) + "&limit=" + str(limit) ) ret = self.http_request(url_offset).json() projects_list = projects_list + ret offset += limit # Get list of repositories for each project. for p in projects_list: p_id = p["id"] # Fetch Git repositories for project url_git = url_projects + str(p_id) + "/git" repos = self._get_repositories(url_git) for repo in repos: yield self.results_simplified(url_api, "git", repo) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Tuleap repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None yield ListedOrigin( lister_id=self.lister_obj.id, url=page["uri"], visit_type=page["type"], last_update=iso8601.parse_date(page["last_update_date"]), )
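The change repeated across the lister diffs above (`pypi`, `rubygems`, `sourceforge`, `tuleap`) is the plumbing of the new `max_origins_per_page`, `max_pages` and `enable_origins` keyword arguments through each lister's constructor down to the base `Lister`/`StatelessLister` class, with `swh/lister/tests/test_pattern.py` exercising the resulting behaviour. As a quick reference, here is a minimal sketch of a stateless lister wired the same way; the lister name, URL and page contents are invented for illustration, and only the constructor signature and the forwarding to `super().__init__()` follow the pattern used in the diff.

```python
# Minimal sketch of a hypothetical lister forwarding the new throttling knobs,
# assuming the swh.lister pattern API shown in the diff (StatelessLister,
# CredentialsType) and swh.scheduler's ListedOrigin model.
from typing import Any, Dict, Iterator, List, Optional

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

PageType = List[Dict[str, Any]]


class ExampleLister(StatelessLister[PageType]):
    """Hypothetical lister, used only to illustrate the constructor plumbing."""

    LISTER_NAME = "example"  # made-up name, not an actual swh lister

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = "https://example.org",
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        # Same plumbing as in the pypi/rubygems/sourceforge/tuleap diffs:
        # the limits are enforced by the base class, the subclass only
        # forwards the keyword arguments.
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

    def get_pages(self) -> Iterator[PageType]:
        # Two fake pages of five origins each, enough to see the caps act.
        for page in range(2):
            yield [{"url": f"https://example.org/p{page}/o{i}"} for i in range(5)]

    def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
        assert self.lister_obj.id is not None
        for origin in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id, url=origin["url"], visit_type="git"
            )
```

With `max_pages=1` a run stops after the first page, with `max_origins_per_page=2` each page is truncated to two origins, and with `enable_origins=False` origins are still recorded in the scheduler but flagged as disabled, which is what the new parametrized tests in `test_pattern.py` assert.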