diff --git a/PKG-INFO b/PKG-INFO index 950c53c..6877e13 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,126 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.3.0 +Version: 2.4.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` +- `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. 
Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) +`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/README.md b/README.md index 5b5b27e..ae43250 100644 --- a/README.md +++ b/README.md @@ -1,101 +1,102 @@ swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` +- `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) +`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
diff --git a/mypy.ini b/mypy.ini index 8aab2fa..b5a4295 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,38 +1,42 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-bs4.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-debian.*] ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-launchpadlib.*] ignore_missing_imports = True [mypy-lazr.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True [mypy-requests_mock.*] ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True + +[mypy-xmltodict.*] +ignore_missing_imports = True + diff --git a/requirements.txt b/requirements.txt index 4f6c24e..8d9bb82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity +xmltodict diff --git a/setup.py b/setup.py index 78aa42d..3460bc8 100755 --- a/setup.py +++ b/setup.py @@ -1,88 +1,89 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue 
requirements.append(line) return requirements setup( name="swh.lister", description="Software Heritage lister", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLSGH/", packages=find_packages(), install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), setup_requires=["setuptools-scm"], extras_require={"testing": parse_requirements("test")}, use_scm_version=True, include_package_data=True, entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register lister.cran=swh.lister.cran:register lister.debian=swh.lister.debian:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register lister.launchpad=swh.lister.launchpad:register lister.npm=swh.lister.npm:register lister.opam=swh.lister.opam:register lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register + lister.maven=swh.lister.maven:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-lister", "Documentation": "https://docs.softwareheritage.org/devel/swh-lister/", }, ) diff --git a/swh.lister.egg-info/PKG-INFO 
b/swh.lister.egg-info/PKG-INFO index 950c53c..6877e13 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,126 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.3.0 +Version: 2.4.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` +- `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. 
Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) +`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index 0fc97ff..d0db159 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,234 +1,246 @@ .gitignore .pre-commit-config.yaml ACKNOWLEDGEMENTS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/new_lister_template.py docs/run_a_new_lister.rst docs/save_forge.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/cli.py swh/lister/pattern.py swh/lister/py.typed swh/lister/utils.py swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/test_lister.py swh/lister/bitbucket/tests/test_tasks.py swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json swh/lister/bitbucket/tests/data/bb_api_repositories_page2.json swh/lister/cgit/__init__.py swh/lister/cgit/lister.py swh/lister/cgit/tasks.py swh/lister/cgit/tests/__init__.py swh/lister/cgit/tests/repo_list.txt 
swh/lister/cgit/tests/test_lister.py swh/lister/cgit/tests/test_tasks.py swh/lister/cgit/tests/data/https_git.baserock.org/cgit swh/lister/cgit/tests/data/https_git.eclipse.org/c swh/lister/cgit/tests/data/https_git.savannah.gnu.org/README swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit_elisp-es.git swh/lister/cgit/tests/data/https_git.tizen/README swh/lister/cgit/tests/data/https_git.tizen/cgit swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=100 swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=50 swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Users swh/lister/cgit/tests/data/https_git.tizen/cgit_Lock-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-base swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-mc1n2 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e3250 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e4x12 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_devices_nfc-plugin-nxp swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_intel_mfld_bootstub-mfld-blackbay swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_mtdev swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_opengl-es-virtual-drv swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libdrm swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libnl swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_xorg_driver_xserver-xorg-misc swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-gallery-efl swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-homescreen-efl swh/lister/cgit/tests/data/https_jff.email/cgit swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py 
swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/cran/tests/data/list-r-packages.json swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/test_lister.py swh/lister/debian/tests/test_tasks.py swh/lister/debian/tests/data/Sources_bullseye swh/lister/debian/tests/data/Sources_buster swh/lister/debian/tests/data/Sources_stretch swh/lister/gitea/__init__.py swh/lister/gitea/lister.py swh/lister/gitea/tasks.py swh/lister/gitea/tests/__init__.py swh/lister/gitea/tests/test_lister.py swh/lister/gitea/tests/test_tasks.py swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/tasks.py swh/lister/github/tests/__init__.py swh/lister/github/tests/test_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/test_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json swh/lister/gitlab/tests/data/https_gitlab.com/api_response_page1.json swh/lister/gnu/__init__.py swh/lister/gnu/lister.py swh/lister/gnu/tasks.py swh/lister/gnu/tree.py swh/lister/gnu/tests/__init__.py swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/gnu/tests/test_tree.py swh/lister/gnu/tests/data/tree.json swh/lister/gnu/tests/data/tree.min.json swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz swh/lister/launchpad/__init__.py 
swh/lister/launchpad/lister.py swh/lister/launchpad/tasks.py swh/lister/launchpad/tests/__init__.py swh/lister/launchpad/tests/conftest.py swh/lister/launchpad/tests/test_lister.py swh/lister/launchpad/tests/test_tasks.py swh/lister/launchpad/tests/data/launchpad_response1.json swh/lister/launchpad/tests/data/launchpad_response2.json +swh/lister/maven/README.md +swh/lister/maven/__init__.py +swh/lister/maven/lister.py +swh/lister/maven/tasks.py +swh/lister/maven/tests/__init__.py +swh/lister/maven/tests/test_lister.py +swh/lister/maven/tests/test_tasks.py +swh/lister/maven/tests/data/http_indexes/export.fld +swh/lister/maven/tests/data/http_indexes/export_incr.fld +swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom +swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom +swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/tasks.py swh/lister/npm/tests/test_lister.py swh/lister/npm/tests/test_tasks.py swh/lister/npm/tests/data/npm_full_page1.json swh/lister/npm/tests/data/npm_full_page2.json swh/lister/npm/tests/data/npm_incremental_page1.json swh/lister/npm/tests/data/npm_incremental_page2.json swh/lister/opam/__init__.py swh/lister/opam/lister.py swh/lister/opam/tasks.py swh/lister/opam/tests/__init__.py swh/lister/opam/tests/test_lister.py swh/lister/opam/tests/test_tasks.py swh/lister/opam/tests/data/fake_opam_repo/repo swh/lister/opam/tests/data/fake_opam_repo/version swh/lister/opam/tests/data/fake_opam_repo/packages/agrid/agrid.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.4/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.5/opam 
swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.6/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/ocb/ocb.0.1/opam swh/lister/packagist/__init__.py swh/lister/packagist/lister.py swh/lister/packagist/tasks.py swh/lister/packagist/tests/__init__.py swh/lister/packagist/tests/test_lister.py swh/lister/packagist/tests/test_tasks.py swh/lister/packagist/tests/data/den1n_contextmenu.json swh/lister/packagist/tests/data/ljjackson_linnworks.json swh/lister/packagist/tests/data/lky_wx_article.json swh/lister/packagist/tests/data/spryker-eco_computop-api.json swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/phabricator/tests/data/__init__.py swh/lister/phabricator/tests/data/phabricator_api_repositories_page1.json swh/lister/phabricator/tests/data/phabricator_api_repositories_page2.json swh/lister/pypi/__init__.py swh/lister/pypi/lister.py swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/test_lister.py swh/lister/pypi/tests/test_tasks.py swh/lister/sourceforge/__init__.py swh/lister/sourceforge/lister.py swh/lister/sourceforge/tasks.py swh/lister/sourceforge/tests/__init__.py swh/lister/sourceforge/tests/test_lister.py swh/lister/sourceforge/tests/test_tasks.py swh/lister/sourceforge/tests/data/adobexmp.json swh/lister/sourceforge/tests/data/backapps-website.json swh/lister/sourceforge/tests/data/backapps.json swh/lister/sourceforge/tests/data/main-sitemap.xml swh/lister/sourceforge/tests/data/mojunk.json swh/lister/sourceforge/tests/data/mramm.json 
swh/lister/sourceforge/tests/data/os3dmodels.json swh/lister/sourceforge/tests/data/random-mercurial.json swh/lister/sourceforge/tests/data/subsitemap-0.xml swh/lister/sourceforge/tests/data/subsitemap-1.xml swh/lister/tests/__init__.py swh/lister/tests/test_cli.py swh/lister/tests/test_pattern.py swh/lister/tests/test_utils.py swh/lister/tuleap/__init__.py swh/lister/tuleap/lister.py swh/lister/tuleap/tasks.py swh/lister/tuleap/tests/__init__.py swh/lister/tuleap/tests/test_lister.py swh/lister/tuleap/tests/test_tasks.py swh/lister/tuleap/tests/data/https_tuleap.net/projects swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 swh/lister/tuleap/tests/data/https_tuleap.net/repo_3 \ No newline at end of file diff --git a/swh.lister.egg-info/entry_points.txt b/swh.lister.egg-info/entry_points.txt index 244c95b..840f4d6 100644 --- a/swh.lister.egg-info/entry_points.txt +++ b/swh.lister.egg-info/entry_points.txt @@ -1,21 +1,22 @@ [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register lister.cran=swh.lister.cran:register lister.debian=swh.lister.debian:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register lister.launchpad=swh.lister.launchpad:register lister.npm=swh.lister.npm:register lister.opam=swh.lister.opam:register lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register + lister.maven=swh.lister.maven:register \ No newline at end of file diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 23b6ceb..847447b 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,17 +1,18 @@ 
python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity +xmltodict swh.core[db]>=0.9 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 165bbc2..b5979f5 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,294 +1,298 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bz2 from collections import defaultdict from dataclasses import dataclass, field import gzip from itertools import product import logging import lzma +import os from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple from urllib.parse import urljoin from debian.deb822 import Sources import requests from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) decompressors: Dict[str, Callable[[Any], Any]] = { "gz": lambda f: gzip.GzipFile(fileobj=f), "bz2": bz2.BZ2File, "xz": lzma.LZMAFile, } Suite = str Component = str PkgName = str PkgVersion = str DebianOrigin = str DebianPageType = Iterator[Sources] @dataclass class DebianListerState: """State of debian lister""" package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) """Dictionary mapping a package name to all the versions found during last listing""" class DebianLister(Lister[DebianListerState, DebianPageType]): """ List source packages for a given debian or derivative distribution. The lister will create a snapshot for each package name from all its available versions. 
If a package snapshot is different from the last listing operation, it will be send to the scheduler that will create a loading task to archive newly found source code. Args: scheduler: instance of SchedulerInterface distribution: identifier of listed distribution (e.g. Debian, Ubuntu) mirror_url: debian package archives mirror URL suites: list of distribution suites to process components: list of package components to process """ LISTER_NAME = "debian" def __init__( self, scheduler: SchedulerInterface, distribution: str = "Debian", mirror_url: str = "http://deb.debian.org/debian/", suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, url=mirror_url, instance=distribution, credentials=credentials, ) # to ensure urljoin will produce valid Sources URL if not self.url.endswith("/"): self.url += "/" self.distribution = distribution self.suites = suites self.components = components self.session = requests.Session() self.session.headers.update({"User-Agent": USER_AGENT}) # will hold all listed origins info self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {} # will contain origin urls that have already been listed # in a previous page self.sent_origins: Set[DebianOrigin] = set() # will contain already listed package info that need to be sent # to the scheduler for update in the commit_page method self.origins_to_update: Dict[DebianOrigin, ListedOrigin] = {} # will contain the lister state after a call to run self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} def state_from_dict(self, d: Dict[str, Any]) -> DebianListerState: return DebianListerState(package_versions={k: set(v) for k, v in d.items()}) def state_to_dict(self, state: DebianListerState) -> Dict[str, Any]: return {k: list(v) for k, v in state.package_versions.items()} def debian_index_urls( self, suite: Suite, component: Component ) -> 
Iterator[Tuple[str, str]]: """Return an iterator on possible Sources file URLs as multiple compression formats can be used.""" compression_exts = ("xz", "bz2", "gz") base_urls = [ urljoin(self.url, f"dists/{suite}/{component}/source/Sources"), urljoin(self.url, f"dists/{suite}/updates/{component}/source/Sources"), ] for base_url, ext in product(base_urls, compression_exts): yield (f"{base_url}.{ext}", ext) yield (base_url, "") def page_request(self, suite: Suite, component: Component) -> DebianPageType: """Return parsed package Sources file for a given debian suite and component.""" for url, compression in self.debian_index_urls(suite, component): response = requests.get(url, stream=True) logging.debug("Fetched URL: %s, status code: %s", url, response.status_code) if response.status_code == 200: break else: raise Exception( "Could not retrieve sources index for %s/%s", suite, component ) decompressor = decompressors.get(compression) if decompressor: data = decompressor(response.raw) else: data = response.raw return Sources.iter_paragraphs(data.readlines()) def get_pages(self) -> Iterator[DebianPageType]: """Return an iterator on parsed debian package Sources files, one per combination of debian suite and component.""" for suite, component in product(self.suites, self.components): logger.debug( "Processing %s %s source packages info for %s component.", self.instance, suite, component, ) self.current_suite = suite self.current_component = component yield self.page_request(suite, component) def origin_url_for_package(self, package_name: PkgName) -> DebianOrigin: """Return the origin url for the given package""" return f"deb://{self.instance}/packages/{package_name}" def get_origins_from_page(self, page: DebianPageType) -> Iterator[ListedOrigin]: """Convert a page of debian package sources into an iterator of ListedOrigin. 
Please note that the returned origins correspond to packages only listed for the first time in order to get an accurate origins counter in the statistics returned by the run method of the lister. Packages already listed in another page but with different versions will be put in cache by the method and updated ListedOrigin objects will be sent to the scheduler later in the commit_page method. Indeed as multiple debian suites can be processed, a similar set of package names can be listed for two different package source pages, only their version will differ, resulting in origins counted multiple times in lister statistics. """ assert self.lister_obj.id is not None origins_to_send = {} self.origins_to_update = {} # iterate on each package source info for src_pkg in page: # gather package files info that will be used by the debian loader files: Dict[str, Dict[str, Any]] = defaultdict(dict) for field_ in src_pkg._multivalued_fields: if field_.startswith("checksums-"): sum_name = field_[len("checksums-") :] else: sum_name = "md5sum" if field_ in src_pkg: for entry in src_pkg[field_]: name = entry["name"] - files[name]["name"] = entry["name"] + files[name]["name"] = name files[name]["size"] = int(entry["size"], 10) files[name][sum_name] = entry[sum_name] + files[name]["uri"] = os.path.join( + self.url, src_pkg["Directory"], name + ) # extract package name and version package_name = src_pkg["Package"] package_version = src_pkg["Version"] # build origin url origin_url = self.origin_url_for_package(package_name) # create package version key as expected by the debian loader package_version_key = ( f"{self.current_suite}/{self.current_component}/{package_version}" ) # this is the first time a package is listed if origin_url not in self.listed_origins: # create a ListedOrigin object for it that can be later # updated with new package versions info self.listed_origins[origin_url] = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="deb", - 
extra_loader_arguments={"date": None, "packages": {}}, + extra_loader_arguments={"packages": {}}, ) # origin will be yielded at the end of that method origins_to_send[origin_url] = self.listed_origins[origin_url] # init set that will contain all listed package versions self.package_versions[package_name] = set() # package has already been listed in a previous page or current page elif origin_url not in origins_to_send: # if package has been listed in a previous page, its new versions # will be added to its ListedOrigin object but the update will # be sent to the scheduler in the commit_page method self.origins_to_update[origin_url] = self.listed_origins[origin_url] # update package versions data in parameter that will be provided # to the debian loader self.listed_origins[origin_url].extra_loader_arguments["packages"].update( { package_version_key: { "name": package_name, "version": package_version, "files": files, } } ) # add package version key to the set of found versions self.package_versions[package_name].add(package_version_key) # package has already been listed during a previous listing process if package_name in self.state.package_versions: new_versions = ( self.package_versions[package_name] - self.state.package_versions[package_name] ) # no new versions so far, no need to send the origin to the scheduler if not new_versions: origins_to_send.pop(origin_url, None) self.origins_to_update.pop(origin_url, None) # new versions found, ensure the origin will be sent to the scheduler elif origin_url not in self.sent_origins: self.origins_to_update.pop(origin_url, None) origins_to_send[origin_url] = self.listed_origins[origin_url] # update already counted origins with changes since last page self.sent_origins.update(origins_to_send.keys()) logger.debug( "Found %s new packages, %s packages with new versions.", len(origins_to_send), len(self.origins_to_update), ) logger.debug( "Current total number of listed packages is equal to %s.", len(self.listed_origins), ) 
yield from origins_to_send.values() def get_origins_to_update(self) -> Iterator[ListedOrigin]: yield from self.origins_to_update.values() def commit_page(self, page: DebianPageType): """Send to scheduler already listed origins where new versions have been found in current page.""" self.send_origins(self.get_origins_to_update()) def finalize(self): # set mapping between listed package names and versions as lister state self.state.package_versions = self.package_versions self.updated = len(self.sent_origins) > 0 diff --git a/swh/lister/debian/tests/test_lister.py b/swh/lister/debian/tests/test_lister.py index 2dc5c6d..754adb5 100644 --- a/swh/lister/debian/tests/test_lister.py +++ b/swh/lister/debian/tests/test_lister.py @@ -1,232 +1,241 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict +import os from pathlib import Path from typing import Dict, List, Set, Tuple from debian.deb822 import Sources import pytest from swh.lister.debian.lister import ( DebianLister, DebianOrigin, PkgName, PkgVersion, Suite, ) from swh.scheduler.interface import SchedulerInterface # Those tests use sample debian Sources files whose content has been extracted # from the real Sources files from stretch, buster and bullseye suite. 
# They contain the follwowing package source info # - stretch: # * dh-elpa (versions: 0.0.18, 0.0.19, 0.0.20), # * git (version: 1:2.11.0-3+deb9u7) # - buster: # * git (version: 1:2.20.1-2+deb10u3), # * subversion (version: 1.10.4-1+deb10u1) # - bullseye: # * git (version: 1:2.29.2-1) # * subversion (version: 1.14.0-3) # * hg-git (version: 0.9.0-2) _mirror_url = "http://deb.debian.org/debian" _suites = ["stretch", "buster", "bullseye"] _components = ["main"] SourcesText = str def _debian_sources_content(datadir: str, suite: Suite) -> SourcesText: return Path(datadir, f"Sources_{suite}").read_text() @pytest.fixture def debian_sources(datadir: str) -> Dict[Suite, SourcesText]: return {suite: _debian_sources_content(datadir, suite) for suite in _suites} # suite -> package name -> list of versions DebianSuitePkgSrcInfo = Dict[Suite, Dict[PkgName, List[Sources]]] def _init_test( swh_scheduler: SchedulerInterface, debian_sources: Dict[Suite, SourcesText], requests_mock, ) -> Tuple[DebianLister, DebianSuitePkgSrcInfo]: lister = DebianLister( scheduler=swh_scheduler, mirror_url=_mirror_url, suites=list(debian_sources.keys()), components=_components, ) suite_pkg_info: DebianSuitePkgSrcInfo = {} for suite, sources in debian_sources.items(): suite_pkg_info[suite] = defaultdict(list) for pkg_src in Sources.iter_paragraphs(sources): suite_pkg_info[suite][pkg_src["Package"]].append(pkg_src) for idx_url, compression in lister.debian_index_urls(suite, _components[0]): if compression: requests_mock.get(idx_url, status_code=404) else: requests_mock.get(idx_url, text=sources) return lister, suite_pkg_info def _check_listed_origins( swh_scheduler: SchedulerInterface, lister: DebianLister, suite_pkg_info: DebianSuitePkgSrcInfo, lister_previous_state: Dict[PkgName, Set[PkgVersion]], ) -> Set[DebianOrigin]: scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = set() # iterate on each debian suite for the main component for suite, pkg_info in 
suite_pkg_info.items(): # iterate on each package for package_name, pkg_srcs in pkg_info.items(): # iterate on each package version info for pkg_src in pkg_srcs: # build package version key package_version_key = f"{suite}/{_components[0]}/{pkg_src['Version']}" # if package or its version not previously listed, those info should # have been sent to the scheduler database if ( package_name not in lister_previous_state or package_version_key not in lister_previous_state[package_name] ): # build origin url origin_url = lister.origin_url_for_package(package_name) origin_urls.add(origin_url) # get ListerOrigin object from scheduler database filtered_origins = [ scheduler_origin for scheduler_origin in scheduler_origins if scheduler_origin.url == origin_url ] assert filtered_origins + packages = filtered_origins[0].extra_loader_arguments["packages"] # check the version info are available - assert ( - package_version_key - in filtered_origins[0].extra_loader_arguments["packages"] - ) + assert package_version_key in packages + + # check package files URIs are available + for file in pkg_src["files"]: + filename = file["name"] + file_uri = os.path.join( + _mirror_url, pkg_src["Directory"], filename + ) + package_files = packages[package_version_key]["files"] + assert filename in package_files + assert package_files[filename]["uri"] == file_uri # check listed package version is in lister state assert package_name in lister.state.package_versions assert ( package_version_key in lister.state.package_versions[package_name] ) return origin_urls def test_lister_debian_all_suites( swh_scheduler: SchedulerInterface, debian_sources: Dict[Suite, SourcesText], requests_mock, ): """ Simulate a full listing of main component packages for all debian suites. 
""" lister, suite_pkg_info = _init_test(swh_scheduler, debian_sources, requests_mock) stats = lister.run() origin_urls = _check_listed_origins( swh_scheduler, lister, suite_pkg_info, lister_previous_state={} ) assert stats.pages == len(_suites) * len(_components) assert stats.origins == len(origin_urls) stats = lister.run() assert stats.pages == len(_suites) * len(_components) assert stats.origins == 0 @pytest.mark.parametrize( "suites_params", [[_suites[:1]], [_suites[:1], _suites[:2]], [_suites[:1], _suites[:2], _suites],], ) def test_lister_debian_updated_packages( swh_scheduler: SchedulerInterface, debian_sources: Dict[Suite, SourcesText], requests_mock, suites_params: List[Suite], ): """ Simulate incremental listing of main component packages by adding new suite to process between each listing operation. """ lister_previous_state: Dict[PkgName, Set[PkgVersion]] = {} for idx, suites in enumerate(suites_params): sources = {suite: debian_sources[suite] for suite in suites} lister, suite_pkg_info = _init_test(swh_scheduler, sources, requests_mock) stats = lister.run() origin_urls = _check_listed_origins( swh_scheduler, lister, suite_pkg_info, lister_previous_state=lister_previous_state, ) assert stats.pages == len(sources) assert stats.origins == len(origin_urls) lister_previous_state = lister.state.package_versions # only new packages or packages with new versions should be listed if len(suites) > 1 and idx < len(suites) - 1: assert stats.origins == 0 else: assert stats.origins != 0 @pytest.mark.parametrize( "credentials, expected_credentials", [ (None, []), ({"key": "value"}, []), ( {"debian": {"Debian": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], ) def test_lister_debian_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): lister = DebianLister(swh_scheduler, credentials=credentials) # Credentials are allowed in constructor assert lister.credentials == expected_credentials def 
test_lister_debian_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "credentials": {}, } lister = DebianLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index bbb1f63..f4246a2 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,351 +1,357 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import datetime import logging import random import time -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 import requests from tenacity import ( retry, retry_any, retry_if_exception_type, retry_if_result, wait_exponential, ) from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. 
import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) def init_session(session: Optional[requests.Session] = None) -> requests.Session: """Initialize a requests session with the proper headers for requests to GitHub.""" if not session: session = requests.Session() session.headers.update( {"Accept": "application/vnd.github.v3+json", "User-Agent": USER_AGENT} ) return session class RateLimited(Exception): def __init__(self, response): self.reset_time: Optional[int] # Figure out how long we need to sleep because of that rate limit ratelimit_reset = response.headers.get("X-Ratelimit-Reset") retry_after = response.headers.get("Retry-After") if ratelimit_reset is not None: self.reset_time = int(ratelimit_reset) elif retry_after is not None: self.reset_time = int(time.time()) + int(retry_after) + 1 else: logger.warning( "Received a rate-limit-like status code %s, but no rate-limit " "headers set. Response content: %s", response.status_code, response.content, ) self.reset_time = None self.response = response @retry( wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_any( # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g. # when running the lister on a connection with high latency retry_if_exception_type(requests.exceptions.ChunkedEncodingError), # 502 status codes happen for a Server Error, sometimes retry_if_result(lambda r: r.status_code == 502), ), ) def github_request( url: str, token: Optional[str] = None, session: Optional[requests.Session] = None ) -> requests.Response: session = init_session(session) headers = {} if token: headers["Authorization"] = f"token {token}" response = session.get(url, headers=headers) anonymous = token is None and "Authorization" not in session.headers if ( # GitHub returns inconsistent status codes between unauthenticated # rate limit and authenticated rate limits. Handle both. 
response.status_code == 429 or (anonymous and response.status_code == 403) ): raise RateLimited(response) return response @dataclass class GitHubListerState: """State of the GitHub lister""" last_seen_id: int = 0 """Numeric id of the last repository listed on an incremental pass""" class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """List origins from GitHub. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_id` stored in the scheduler backend. Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in that mode, the lister finds the origins present in the range **excluding** `first_id` and **including** `last_id`. In this mode, the lister can overrun the `last_id`: it will always record all the origins seen in a given page. As the lister is fully idempotent, this is not a practical problem. Once relisting completes, the lister state in the scheduler backend is not updated. When the config contains a set of credentials, we shuffle this list at the beginning of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the same token over and over again, until its rate limit runs out. Once that happens, we switch to the next token over in our shuffled list. When a request fails with a rate limit exception for all tokens, we pause the listing until the largest value for X-Ratelimit-Reset over all tokens. When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits Args: first_id: the id of the first repo to list last_id: stop listing after seeing a repo with an id higher than this value. 
""" # noqa: E501 LISTER_NAME = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, first_id: Optional[int] = None, last_id: Optional[int] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance="github", ) self.first_id = first_id self.last_id = last_id self.relisting = self.first_id is not None or self.last_id is not None self.session = init_session() random.shuffle(self.credentials) self.anonymous = not self.credentials if self.anonymous: logger.warning("No tokens set in configuration, using anonymous mode") self.token_index = -1 self.current_user: Optional[str] = None if not self.anonymous: # Initialize the first token value in the session headers self.set_next_session_token() def set_next_session_token(self) -> None: """Update the current authentication token with the next one in line.""" self.token_index = (self.token_index + 1) % len(self.credentials) auth = self.credentials[self.token_index] if "password" in auth: token = auth["password"] else: token = auth["token"] self.current_user = auth["username"] logger.debug("Using authentication token for user %s", self.current_user) self.session.headers.update({"Authorization": f"token {token}"}) def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: current_id = 0 if self.first_id is not None: current_id = self.first_id elif self.state is not None: current_id = self.state.last_seen_id current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}" while self.last_id is None or current_id < self.last_id: logger.debug("Getting page %s", current_url) # The following for/else loop handles rate limiting; if successful, # it provides the rest of the function with a 
`response` object. # # If all tokens are rate-limited, we sleep until the reset time, # then `continue` into another iteration of the outer while loop, # attempting to get data from the same URL again. max_attempts = 1 if self.anonymous else len(self.credentials) reset_times: Dict[int, int] = {} # token index -> time for attempt in range(max_attempts): try: response = github_request(current_url, session=self.session) break except RateLimited as e: reset_info = "(unknown reset)" if e.reset_time is not None: reset_times[self.token_index] = e.reset_time reset_info = "(resetting in %ss)" % (e.reset_time - time.time()) if not self.anonymous: logger.info( "Rate limit exhausted for current user %s %s", self.current_user, reset_info, ) # Use next token in line self.set_next_session_token() # Wait one second to avoid triggering GitHub's abuse rate limits time.sleep(1) else: # All tokens have been rate-limited. What do we do? if not reset_times: logger.warning( "No X-Ratelimit-Reset value found in responses for any token; " "Giving up." ) break sleep_time = max(reset_times.values()) - time.time() + 1 logger.info( "Rate limits exhausted for all tokens. Sleeping for %f seconds.", sleep_time, ) time.sleep(sleep_time) # This goes back to the outer page-by-page loop, doing one more # iteration on the same page continue # We've successfully retrieved a (non-ratelimited) `response`. We # still need to check it for validity. if response.status_code != 200: logger.warning( "Got unexpected status_code %s: %s", response.status_code, response.content, ) break yield response.json() if "next" not in response.links: # No `next` link, we've reached the end of the world logger.debug( "No next link found in the response headers, all caught up" ) break # GitHub strongly advises to use the next link directly. We still # parse it to get the id of the last repository we've reached so # far. 
next_url = response.links["next"]["url"] parsed_url = urlparse(next_url) if not parsed_url.query: logger.warning("Failed to parse url %s", next_url) break parsed_query = parse_qs(parsed_url.query) current_id = int(parsed_query["since"][0]) current_url = next_url def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of GitHub repositories into a list of ListedOrigins. This records the html_url, as well as the pushed_at value if it exists. """ assert self.lister_obj.id is not None + seen_in_page: Set[str] = set() + for repo in page: if not repo: # null repositories in listings happen sometimes... continue + if repo["html_url"] in seen_in_page: + continue + seen_in_page.add(repo["html_url"]) + pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: pushed_at = iso8601.parse_date(pushed_at_str) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["html_url"], visit_type="git", last_update=pushed_at, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page""" if self.relisting: # Don't update internal state when relisting return if not page: # Sometimes, when you reach the end of the world, GitHub returns an empty # page of repositories return last_id = page[-1]["id"] if last_id > self.state.last_seen_id: self.state.last_seen_id = last_id def finalize(self): if self.relisting: return # Pull fresh lister state from the scheduler backend scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. 
if self.state.last_seen_id > scheduler_state.last_seen_id: self.updated = True diff --git a/swh/lister/maven/README.md b/swh/lister/maven/README.md new file mode 100644 index 0000000..316c7f3 --- /dev/null +++ b/swh/lister/maven/README.md @@ -0,0 +1,142 @@ + +## The Maven lister + +This readme describes the design decisions made during development. + +More information can be found on the Software Heritage forge at [https://forge.softwareheritage.org/T1724](https://forge.softwareheritage.org/T1724) and on the diff of the lister at [https://forge.softwareheritage.org/D6133](https://forge.softwareheritage.org/D6133) . + +## Execution sequence (TL;DR) + +The complete sequence of actions to list the source artifacts and scm urls is as follows: + +On the `index_exporter` server (asynchronously): + +* Check the list of remote indexes, and compare it to the list of local index files. +* Retrieve the missing Maven Indexer indexes from the remote repository. \ + Example of index from Maven Central: [https://repo1.maven.org/maven2/.index/](https://repo1.maven.org/maven2/.index/) +* Start execution of the Docker container: + * If the `indexes` directory doesn't exist, unpack the Lucene indexes from the Maven Indexer indexes using `indexer-cli`.\ + This generates a set of binary files as shown below: + + ``` + boris@castalia:maven$ ls -lh /media/home2/work/indexes/ + total 5,2G + -rw-r--r-- 1 root root 500M juil. 7 22:06 _4m.fdt + -rw-r--r-- 1 root root 339K juil. 7 22:06 _4m.fdx + -rw-r--r-- 1 root root 2,2K juil. 7 22:07 _4m.fnm + -rw-r--r-- 1 root root 166M juil. 7 22:07 _4m_Lucene50_0.doc + -rw-r--r-- 1 root root 147M juil. 7 22:07 _4m_Lucene50_0.pos + -rw-r--r-- 1 root root 290M juil. 7 22:07 _4m_Lucene50_0.time + -rw-r--r-- 1 root root 3,1M juil. 7 22:07 _4m_Lucene50_0.tip + [SNIP] + -rw-r--r-- 1 root root 363 juil. 7 22:06 _e0.si + -rw-r--r-- 1 root root 1,7K juil. 7 22:07 segments_2 + -rw-r--r-- 1 root root 8 juil. 7 21:54 timestamp + -rw-r--r-- 1 root root 0 juil. 
7 21:54 write.lock + ``` + * If the `export` directory doesn't exist, export the Lucene documents from the Lucene indexes using `clue`.\ + This generates a set of text files as shown below: + + ``` + boris@castalia:~$ ls -lh /work/export/ + total 49G + -rw-r--r-- 1 root root 13G juil. 7 22:12 _p.fld + -rw-r--r-- 1 root root 7,0K juil. 7 22:21 _p.inf + -rw-r--r-- 1 root root 2,9G juil. 7 22:21 _p.len + -rw-r--r-- 1 root root 33G juil. 7 22:20 _p.pst + -rw-r--r-- 1 root root 799 juil. 7 22:21 _p.si + -rw-r--r-- 1 root root 138 juil. 7 22:21 segments_1 + -rw-r--r-- 1 root root 0 juil. 7 22:07 write.lock + ``` +* On the host, copy export files to `/var/www/html/` to make them available on the network. + +On the lister side: + +* Get the exports from the above local index server. +* Extract the list of all pom and source artefacts from the Lucene export. +* Yield the list of source artefacts to the Maven Loader as they are found. +* Download all poms from the above list. +* Parse all poms to extract the scm attribute, and yield the list of scm urls towards the classic loaders (git, svn, hg..). + +The process has been optimised as much as it could be, scaling down from 140 GB on disk / 60 GB RAM / 90 mn exec time to 60 GB on disk / 2 GB (excl. docker) / 32 mn exec time. + +For the long read about why we came to here, please continue. + +## About the Maven ecosystem + +Maven repositories are a loose, decentralised network of HTTP servers with a well-defined hosted structure. They are used according to the Maven dependency resolver[i](#sdendnote1sym), an inheritance-based mechanism used to identify and locate artefacts required in Maven builds. + +There is no uniform, standardised way to list the contents of maven repositories, since consumers are supposed to know what artefacts they need. 
Instead, Maven repository owners usually set up a Maven Indexer[ii](#sdendnote2sym) to enable source code identification and listing in IDEs – for this reason, source jars usually don’t have build files and information, only providing pure sources. + +Maven Indexer is not a mandatory part of the maven repository stack, but it is the *de facto* standard for maven repositories indexing and querying. All major Maven repositories we have seen so far use it. Most artefacts are located in the main central repository: Maven Central[iii](#sdendnote3sym), hosted and run by Sonatype[iv](#sdendnote4sym). Other well-known repositories are listed on MVN Repository[v](#sdendnote5sym). + +Maven repositories are mainly used for binary content (e.g. class jars), but the following sources of information are relevant to our goal in the maven repositories/ecosystem: + +* SCM attributes in pom XML files contain the **scm URL** of the associated source code. They can be fed to standard Git/SVN/others loaders. +* **Source artefacts** contain pure source code (i.e. no build files) associated to the artefact. There are two main naming conventions for them, although not always enforced: + * ${artifactId}-${version}-source-release.zip + * ${artifactId}-${version}-src.zip + + They come in various archiving formats (jar, zip, tar.bz2, tar.gz) and require a specific loader to attach the artefact metadata.
+ +[i](#sdendnote1anc)Maven dependency resolver: [https://maven.apache.org/resolver/index.html](https://maven.apache.org/resolver/index.html) + +[ii](#sdendnote2anc)Maven Indexer: [https://maven.apache.org/maven-indexer/](https://maven.apache.org/maven-indexer/) + +[iii](#sdendnote3anc)Maven Central: [https://search.maven.org/](https://search.maven.org/) + +[iv](#sdendnote4anc)Sonatype Company: [https://www.sonatype.com/](https://www.sonatype.com/) + +[v](#sdendnote5anc)MVN Repository: [https://mvnrepository.com/repos](https://mvnrepository.com/repos) + +## Preliminary research + +Listing the full content of a Maven repository is very unusual, and the whole system has not been built for this purpose. Instead, tools and build systems can easily fetch individual artefacts according to their Maven coordinates (groupId, artifactId, version, classifier, extension). Usual listing means (e.g. scapping) are highly discouraged and will trigger bannishment easily. There is no common API defined either. + +Once we have the artifactId/group we can easily get the list of versions (e.g. for updates) by reading the [maven-metadata.xml file at the package level](https://repo1.maven.org/maven2/ant/ant/maven-metadata.xml), although this is not always reliable. The various options that were investigated to get the interesting artefacts are: + +* **Scrapping** could work but is explicitly forbidden[i](#sdendnote1sym). Pages could easily be parsed through, and it would allow to identify \*all\* artifacts. +* Using **Maven indexes** is the "official" way to retrieve information from a maven repository and most repositories provide this feature. It would also enable a smart incremental listing. The Maven Indexer data format however is not we + ll documented. It relies under the hood on an old version (Lucene54) of a lucene indexes, and the only libraries that can access it are written in java. 
This implies a dedicated Docker container with a jvm and some specific tools (maven indexer and luke for the lucene index), and thus would bring some complexity to the docker & prod setups. +* A third path could be to **parse all the pom.xml's** that we find and follow all artifactId's recursively, building a graph of dependencies and parent poms. This is more of a non-complete heuristic, and we would miss leaf nodes (i.e. artifacts that are not used by others), but it could help setup a basic list. +* It should be noted also that there are two main implementations of maven repositories: Nexus and Artifactory. By being more specific we could use the respective APIs of these products to get information. But getting the full list of artefacts is still not straightforward, and we'd lose any generic treatment doing so. + +The best option in our opinion is to go with the Maven Indexer, for it is the most complete listing available (notably for the biggest repository by far: maven central). + +[i](#sdendnote1anc)Maven repository’s Terms of Service: [https://repo1.maven.org/terms.html](https://repo1.maven.org/terms.html) + +## Maven indexes conversion + +[Maven-Indexer](https://maven.apache.org/maven-indexer/) is a (thick) wrapper around lucene. It parses the repository and stores documents, fields and terms in an index. One can extract the lucene index from a maven index using the command: `java -jar indexer-cli-5.1.1.jar --unpack nexus-maven-repository-index.gz --destination test --type full`. Note however that 5.1.1 is an old version of maven indexer; newer versions of the maven indexer won't work on the central indexes. + +[Clue](https://maven.apache.org/maven-indexer/) is a CLI tool to read lucene indexes, and version 6.2.0 works with our maven indexes. One can use the following command to export the index to text: `java -jar clue-6.2.0-1.0.0.jar maven/central-lucene-index/ export central_export text`. 
+ +The exported text file looks like this: + +``` +doc 0 + field 0 + name u + type string + value com.redhat.rhevm.api|rhevm-api-powershell-jaxrs|1.0-rc1.16|javadoc|jar + field 1 + name m + type string + value 1321264789727 + field 2 + name i + type string + value jar|1320743675000|768291|2|2|1|jar + field 10 + name n + type string + value RHEV-M API Powershell Wrapper Implementation JAX-RS + field 13 + name 1 + type string + value 454eb6762e5bb14a75a21ae611ce2048dd548550 +``` + +The execution of these two jars requires a Java virtual machine -- java execution in python is not possible without a JVM. Docker is a good way to run both tools and generate the exports independently, rather than add a JVM to the existing production environment. + +We decided (2021-08-25) to install and execute a docker container on a separate server so the lister would simply have to fetch it on the network and parse it (the latter part in pure python). diff --git a/swh/lister/maven/__init__.py b/swh/lister/maven/__init__.py new file mode 100644 index 0000000..b26b3cb --- /dev/null +++ b/swh/lister/maven/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import MavenLister + + return { + "lister": MavenLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py new file mode 100644 index 0000000..01f6060 --- /dev/null +++ b/swh/lister/maven/lister.py @@ -0,0 +1,341 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import asdict, dataclass +import logging +import re +from typing import Any, Dict, Iterator, Optional +from 
logger = logging.getLogger(__name__)

# A "page" yielded by get_pages: either a maven source-archive record or an
# scm record extracted from a pom file.
RepoPage = Dict[str, Any]


@dataclass
class MavenListerState:
    """State of the MavenLister"""

    last_seen_doc: int = -1
    """Last doc ID ingested during an incremental pass

    """

    last_seen_pom: int = -1
    """Last doc ID related to a pom and ingested during
    an incremental pass

    """


class MavenLister(Lister[MavenListerState, RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds.
    It includes POM files and source archives, which we download to get
    the source code of artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the Artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar)."""

    LISTER_NAME = "maven"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.

        """
        self.BASE_URL = url
        self.INDEX_URL = index_url
        self.incremental = incremental

        if instance is None:
            # Derive a stable instance name from the repository host.
            instance = parse_url(url).host

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
        """Deserialize the scheduler-persisted state."""
        return MavenListerState(**d)

    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        """Serialize the state for the scheduler."""
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """GET ``url`` through the lister session, with throttling retry.

        Raises:
            requests.HTTPError: on any non-200 response.
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
            response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[RepoPage]:
        """Retrieve and parse exported maven indexes to
        identify all pom files and src archives.
        """

        # Example of returned RepoPage's:
        # [
        #     {
        #         "type": "maven",
        #         "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
        #         "time": 1626109619335,
        #         "gid": "org.xwiki.platform",
        #         "aid": "xwiki-platform-wikistream-events-xwiki",
        #         "version": "5.4.2"
        #     },
        #     {
        #         "type": "scm",
        #         "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
        #         "project": "openengsb-framework",
        #     },
        #     ...
        # ]

        # Download the main text index file.
        # NOTE(review): downloaded outside of self.session/page_request, so it
        # gets no throttling retry — presumably acceptable for a local export
        # host; confirm before routing it through page_request.
        logger.info("Downloading text index from %s.", self.INDEX_URL)
        assert self.INDEX_URL is not None
        response = requests.get(self.INDEX_URL, stream=True)
        response.raise_for_status()

        # Prepare regexes to parse index exports.

        # Parse doc id.
        # Example line: "doc 13"
        re_doc = re.compile(r"^doc (?P<doc>\d+)$")

        # Parse gid, aid, version, classifier, extension.
        # Example line: "    value al.aldi|sprova4j|0.1.0|sources|jar"
        re_val = re.compile(
            r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
            r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
        )

        # Parse last modification time.
        # Example line: "    value jar|1626109619335|14316|2|2|0|jar"
        re_time = re.compile(
            r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
            r"\|([^|]+)\|([^|]+)$"
        )

        # Read file line by line and process it
        out_pom: Dict[str, int] = {}
        jar_src: Dict[str, Any] = {}
        doc_id: int = 0
        jar_src["doc"] = None
        url_src = None

        iterator = response.iter_lines(chunk_size=1024)
        for line_bytes in iterator:
            # Read the index text export and get URLs and SCMs.
            line = line_bytes.decode(errors="ignore")
            m_doc = re_doc.match(line)
            if m_doc is not None:
                doc_id = int(m_doc.group("doc"))
                if (
                    self.incremental
                    and self.state
                    and self.state.last_seen_doc
                    and self.state.last_seen_doc >= doc_id
                ):
                    # jar_src["doc"] contains the id of the current document,
                    # whatever its type (scm or jar).
                    jar_src["doc"] = None
                else:
                    jar_src["doc"] = doc_id
            else:
                # If incremental mode, we don't record any line that is
                # before our last recorded doc id.
                if self.incremental and jar_src["doc"] is None:
                    continue
                m_val = re_val.match(line)
                if m_val is not None:
                    (gid, aid, version, classifier, ext) = m_val.groups()
                    ext = ext.strip()
                    path = "/".join(gid.split("."))
                    if classifier == "NA" and ext.lower() == "pom":
                        # If incremental mode, we don't record any line that is
                        # before our last recorded doc id.
                        if (
                            self.incremental
                            and self.state
                            and self.state.last_seen_pom
                            and self.state.last_seen_pom >= doc_id
                        ):
                            continue
                        url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
                        url_pom = urljoin(self.BASE_URL, url_path)
                        out_pom[url_pom] = doc_id
                    elif (
                        classifier.lower() == "sources" or ("src" in classifier)
                    ) and ext.lower() in ("zip", "jar"):
                        url_path = (
                            f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}"
                        )
                        url_src = urljoin(self.BASE_URL, url_path)
                        jar_src["gid"] = gid
                        jar_src["aid"] = aid
                        jar_src["version"] = version
                else:
                    m_time = re_time.match(line)
                    if m_time is not None and url_src is not None:
                        time = m_time.group("mtime")
                        jar_src["time"] = int(time)
                        logger.debug("* Yielding jar %s.", url_src)
                        yield {
                            "type": "maven",
                            "url": url_src,
                            **jar_src,
                        }
                        url_src = None

        logger.info("Found %s poms.", len(out_pom))

        # Now fetch pom files and scan them for scm info.

        logger.info("Fetching poms..")
        for pom in out_pom:
            text = self.page_request(pom, {})
            try:
                project = xmltodict.parse(text.content.decode())
                if "scm" in project["project"]:
                    if "connection" in project["project"]["scm"]:
                        scm = project["project"]["scm"]["connection"]
                        gid = project["project"]["groupId"]
                        aid = project["project"]["artifactId"]
                        yield {
                            "type": "scm",
                            "doc": out_pom[pom],
                            "url": scm,
                            "project": f"{gid}.{aid}",
                        }
                    else:
                        logger.debug("No scm.connection in pom %s", pom)
                else:
                    logger.debug("No scm in pom %s", pom)
            except xmltodict.expat.ExpatError as error:
                logger.info("Could not parse POM %s XML: %s. Next.", pom, error)

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Maven repositories into a list of ListedOrigins.

        """
        assert self.lister_obj.id is not None
        if page["type"] == "scm":
            # If origin is a scm url: detect scm type and yield.
            # Note that the official format is:
            # scm:git:git://github.com/openengsb/openengsb-framework.git
            # but many, many projects directly put the repo url, so we have to
            # detect the content to match it properly.
            m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
            if m_scm is not None:
                scm_type = m_scm.group("type")
                scm_url = m_scm.group("url")
                origin = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=scm_url,
                    visit_type=scm_type,
                )
                yield origin
            else:
                if page["url"].endswith(".git"):
                    origin = ListedOrigin(
                        lister_id=self.lister_obj.id,
                        url=page["url"],
                        visit_type="git",
                    )
                    yield origin
        else:
            # Origin is a source archive:
            origin = ListedOrigin(
                lister_id=self.lister_obj.id,
                url=page["url"],
                visit_type=page["type"],
                extra_loader_arguments={
                    "artifacts": [
                        {
                            "time": page["time"],
                            "gid": page["gid"],
                            "aid": page["aid"],
                            "version": page["version"],
                        }
                    ]
                },
            )
            yield origin

    def commit_page(self, page: RepoPage) -> None:
        """Update currently stored state using the latest listed doc.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            # We need to differentiate the two state counters according
            # to the type of origin.
            if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc:
                self.state.last_seen_doc = page["doc"]
            elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom:
                # scm pages are yielded after all maven pages and may carry a
                # doc id lower than the highest jar doc id already committed;
                # never move last_seen_doc backwards, or the next incremental
                # run would re-list already-seen jar documents.
                self.state.last_seen_doc = max(self.state.last_seen_doc, page["doc"])
                self.state.last_seen_pom = page["doc"]

    def finalize(self) -> None:
        """Finalize the lister state, set update if any progress has been made.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            last_seen_doc = self.state.last_seen_doc
            last_seen_pom = self.state.last_seen_pom

            scheduler_state = self.get_state_from_scheduler()
            if last_seen_doc and last_seen_pom:
                if (scheduler_state.last_seen_doc < last_seen_doc) or (
                    scheduler_state.last_seen_pom < last_seen_pom
                ):
                    self.updated = True
field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1626111735764 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1626111784883 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1626111784915 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 14 + name DESCRIPTOR + type string + value NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index +doc 5 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value al.aldi +doc 6 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string + value al +END +checksum 00000000003321211082 diff --git a/swh/lister/maven/tests/data/http_indexes/export_incr.fld b/swh/lister/maven/tests/data/http_indexes/export_incr.fld new file mode 100755 index 0000000..875f587 --- /dev/null +++ b/swh/lister/maven/tests/data/http_indexes/export_incr.fld @@ -0,0 +1,134 @@ +doc 0 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|sources|jar + 
field 1 + name m + type string + value 1633786348254 + field 2 + name i + type string + value jar|1626109619335|14316|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 1 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.0|NA|pom + field 1 + name m + type string + value 1633786348271 + field 2 + name i + type string + value jar|1626109636636|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 2 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|sources|jar + field 1 + name m + type string + value 1633786370818 + field 2 + name i + type string + value jar|1626111425534|14510|2|2|0|jar + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 3 + field 0 + name u + type string + value al.aldi|sprova4j|0.1.1|NA|pom + field 1 + name m + type string + value 1633786370857 + field 2 + name i + type string + value jar|1626111437014|-1|1|0|0|pom + field 10 + name n + type string + value sprova4j + field 11 + name d + type string + value Java client for Sprova Test Management +doc 4 + field 0 + name u + type string + value com.arangodb|arangodb-graphql|1.2|NA|pom + field 1 + name m + type string + value 1634498235946 + field 2 + name i + type string + value jar|1624265143830|-1|0|0|0|pom + field 10 + name n + type string + value arangodb-graphql + field 11 + name d + type string + value ArangoDB Graphql +doc 5 + field 14 + name DESCRIPTOR + type string + value NexusIndex + field 15 + name IDXINFO + type string + value 1.0|index_1 +doc 6 + field 16 + name allGroups + type string + value allGroups + field 17 + name allGroupsList + type string + value com.arangodb|al.aldi +doc 7 + field 18 + name rootGroups + type string + value rootGroups + field 19 + name rootGroupsList + type string 
+ value com|al +END +checksum 00000000004102281591 diff --git a/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom b/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom new file mode 100755 index 0000000..7a9b788 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom @@ -0,0 +1,208 @@ + + + + + 4.0.0 + + com.arangodb + arangodb-graphql + 1.2 + + arangodb-graphql + ArangoDB Graphql + https://github.com/ArangoDB-Community/arangodb-graphql-java + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0 + repo + + + + + + Colin Findlay + + + Michele Rastelli + https://github.com/rashtao + + + + + UTF-8 + 1.8 + 1.8 + 1.8 + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.8 + true + + ossrh + https://oss.sonatype.org/ + 84aff6e87e214c + false + + + + org.apache.maven.plugins + maven-resources-plugin + 3.1.0 + + UTF-8 + + + + org.apache.maven.plugins + maven-source-plugin + 3.1.0 + + + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.1.1 + + + attach-javadocs + + jar + + + + + + maven-deploy-plugin + 2.8.2 + + false + 10 + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + assembly + package + + single + + + + + + ${project.artifactId}-${project.version}-standalone + + false + false + + jar-with-dependencies + + + + + + + + + com.graphql-java + graphql-java + 11.0 + + + com.arangodb + arangodb-java-driver + 6.5.0 + + + junit + junit + 4.12 + test + + + org.mockito + mockito-core + 2.15.0 + test + + + org.hamcrest + hamcrest-library + 1.3 + test + + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + https://github.com/ArangoDB-Community/arangodb-graphql-java + scm:git:git://github.com/ArangoDB-Community/arangodb-graphql-java.git + 
scm:git:git://github.com/ArangoDB-Community/arangodb-graphql-java.git + + + + ArangoDB GmbH + https://www.arangodb.com + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom new file mode 100644 index 0000000..bc1a35b --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:git:git://github.com/aldialimucaj/sprova4j.git + scm:git:git://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom new file mode 100644 index 0000000..05e5a71 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.1 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + 
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from pathlib import Path

import pytest
import requests

from swh.lister.maven.lister import MavenLister

MVN_URL = "https://repo1.maven.org/maven2/"  # main maven repo url
INDEX_URL = "http://indexes/export.fld"  # index directory url

URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"

# scm origins expected from the base index export (export.fld)
LIST_GIT = (
    "git://github.com/aldialimucaj/sprova4j.git",
    "https://github.com/aldialimucaj/sprova4j.git",
)

# additional scm origin expected only from the incremental export (export_incr.fld)
LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)

# source-archive origins expected from the base index export
LIST_SRC = (
    MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar",
    MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar",
)

# expected extra_loader_arguments for the source-archive origins
LIST_SRC_DATA = (
    {
        "type": "maven",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j"
        + "/0.1.0/sprova4j-0.1.0-sources.jar",
        "time": 1626109619335,
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": "0.1.0",
    },
    {
        "type": "maven",
        "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j"
        + "/0.1.1/sprova4j-0.1.1-sources.jar",
        "time": 1626111425534,
        "gid": "al.aldi",
        "aid": "sprova4j",
        "version": "0.1.1",
    },
)


@pytest.fixture
def maven_index(datadir) -> str:
    """Text content of the full index export fixture."""
    text = Path(datadir, "http_indexes", "export.fld").read_text()
    return text


@pytest.fixture
def maven_index_incr(datadir) -> str:
    """Text content of the incremental index export fixture."""
    text = Path(datadir, "http_indexes", "export_incr.fld").read_text()
    return text


@pytest.fixture
def maven_pom_1(datadir) -> str:
    """sprova4j 0.1.0 pom file content."""
    text = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text()
    return text


@pytest.fixture
def maven_pom_2(datadir) -> str:
    """sprova4j 0.1.1 pom file content."""
    text = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text()
    return text


@pytest.fixture
def maven_pom_3(datadir) -> str:
    """arangodb-graphql 1.2 pom file content."""
    text = Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text()
    return text


def test_maven_full_listing(
    swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2,
):
    """Covers full listing of multiple pages, checking page results and listed
    origins, statelessness."""

    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    # Set up test.
    index_text = maven_index
    requests_mock.get(INDEX_URL, text=index_text)
    requests_mock.get(URL_POM_1, text=maven_pom_1)
    requests_mock.get(URL_POM_2, text=maven_pom_2)

    # Then run the lister.
    stats = lister.run()

    # Start test checks.
    assert stats.pages == 4
    assert stats.origins == 4

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    origin_urls = [origin.url for origin in scheduler_origins]
    assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC)

    for origin in scheduler_origins:
        if origin.visit_type == "maven":
            # Each maven origin must match one of the expected artifact records.
            for src in LIST_SRC_DATA:
                if src.get("url") == origin.url:
                    artifact = origin.extra_loader_arguments["artifacts"][0]
                    assert src.get("time") == artifact["time"]
                    assert src.get("gid") == artifact["gid"]
                    assert src.get("aid") == artifact["aid"]
                    assert src.get("version") == artifact["version"]
                    break
            else:
                raise AssertionError
    # Full (non-incremental) listing must leave the scheduler state untouched.
    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == -1
    assert scheduler_state.last_seen_pom == -1


def test_maven_incremental_listing(
    swh_scheduler,
    requests_mock,
    mocker,
    maven_index,
    maven_index_incr,
    maven_pom_1,
    maven_pom_2,
    maven_pom_3,
):
    """Covers full listing of multiple pages, checking page results and listed
    origins, with a second updated run for statefulness."""

    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=True,
    )

    # Set up test.
    requests_mock.get(INDEX_URL, text=maven_index)
    requests_mock.get(URL_POM_1, text=maven_pom_1)
    requests_mock.get(URL_POM_2, text=maven_pom_2)

    # Then run the lister.
    stats = lister.run()

    # Start test checks.
    assert lister.incremental
    assert lister.updated
    assert stats.pages == 4
    assert stats.origins == 4

    # Second execution of the lister, incremental mode
    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=True,
    )

    # State persisted by the first run: last doc/pom ids seen in export.fld.
    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == 3
    assert scheduler_state.last_seen_pom == 3

    # Set up test.
    requests_mock.get(INDEX_URL, text=maven_index_incr)
    requests_mock.get(URL_POM_3, text=maven_pom_3)

    # Then run the lister.
    stats = lister.run()

    # Start test checks: only the new doc (arangodb pom) is listed.
    assert lister.incremental
    assert lister.updated
    assert stats.pages == 1
    assert stats.origins == 1

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    origin_urls = [origin.url for origin in scheduler_origins]
    assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR)

    for origin in scheduler_origins:
        if origin.visit_type == "maven":
            for src in LIST_SRC_DATA:
                if src.get("url") == origin.url:
                    artifact = origin.extra_loader_arguments["artifacts"][0]
                    assert src.get("time") == artifact["time"]
                    assert src.get("gid") == artifact["gid"]
                    assert src.get("aid") == artifact["aid"]
                    assert src.get("version") == artifact["version"]
                    break
            else:
                raise AssertionError

    scheduler_state = lister.get_state_from_scheduler()
    assert scheduler_state is not None
    assert scheduler_state.last_seen_doc == 4
    assert scheduler_state.last_seen_pom == 4


@pytest.mark.parametrize("http_code", [400, 404, 500, 502])
def test_maven_list_http_error(
    swh_scheduler, requests_mock, mocker, maven_index, http_code
):
    """Test handling of some common HTTP errors:
    - 400: Bad request.
    - 404: Resource not found.
    - 500: Internal server error.
    - 502: Bad gateway or proxy error.
    """

    lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL)

    # Test failure of index retrieval.

    requests_mock.get(INDEX_URL, status_code=http_code)

    with pytest.raises(requests.HTTPError):
        lister.run()

    # Test failure of artefacts retrieval.

    requests_mock.get(INDEX_URL, text=maven_index)
    requests_mock.get(URL_POM_1, status_code=http_code)

    with pytest.raises(requests.HTTPError):
        lister.run()

    # If the maven_index step succeeded but not the get_pom step,
    # then we get only the 2 maven-jar origins (and not the 2 additional
    # src origins).
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == 2
index_url="http://indexes/export.fld" + ) + res = swh_scheduler_celery_app.send_task( + f"swh.lister.maven.tasks.{task_name}", kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(incremental=incremental, **kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/opam/tests/test_lister.py b/swh/lister/opam/tests/test_lister.py index 0079d7a..b39c501 100644 --- a/swh/lister/opam/tests/test_lister.py +++ b/swh/lister/opam/tests/test_lister.py @@ -1,170 +1,170 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import os from tempfile import mkdtemp from unittest.mock import MagicMock import pytest from swh.lister.opam.lister import OpamLister, opam_init module_name = "swh.lister.opam.lister" @pytest.fixture def mock_opam(mocker): """Fixture to bypass the actual opam calls within the test context. 
""" # inhibits the real `subprocess.call` which prepares the required internal opam # state mock_init = mocker.patch(f"{module_name}.call", return_value=None) # replaces the real Popen with a fake one (list origins command) mocked_popen = MagicMock() mocked_popen.stdout = io.BytesIO(b"bar\nbaz\nfoo\n") mock_open = mocker.patch(f"{module_name}.Popen", return_value=mocked_popen) return mock_init, mock_open def test_mock_init_repository_init(mock_opam, tmp_path, datadir): """Initializing opam root directory with an instance should be ok """ mock_init, mock_popen = mock_opam instance = "fake" instance_url = f"file://{datadir}/{instance}" opam_root = str(tmp_path / "test-opam") assert not os.path.exists(opam_root) # This will initialize an opam directory with the instance opam_init(opam_root, instance, instance_url, {}) assert mock_init.called def test_mock_init_repository_update(mock_opam, tmp_path, datadir): """Updating opam root directory with another instance should be ok """ mock_init, mock_popen = mock_opam instance = "fake_opam_repo" instance_url = f"file://{datadir}/{instance}" opam_root = str(tmp_path / "test-opam") os.makedirs(opam_root, exist_ok=True) with open(os.path.join(opam_root, "opam"), "w") as f: f.write("one file to avoid empty folder") assert os.path.exists(opam_root) assert os.listdir(opam_root) == ["opam"] # not empty # This will update the repository opam with another instance opam_init(opam_root, instance, instance_url, {}) assert mock_init.called def test_lister_opam_optional_instance(swh_scheduler): """Instance name should be optional and default to be built out of the netloc.""" netloc = "opam.ocaml.org" instance_url = f"https://{netloc}" lister = OpamLister(swh_scheduler, url=instance_url,) assert lister.instance == netloc assert lister.opam_root == "/tmp/opam/" -def test_urls(swh_scheduler, mock_opam): +def test_urls(swh_scheduler, mock_opam, tmp_path): mock_init, mock_popen = mock_opam instance_url = "https://opam.ocaml.org" - tmp_folder = 
mkdtemp(prefix="swh_opam_lister") + tmp_folder = mkdtemp(dir=tmp_path, prefix="swh_opam_lister") lister = OpamLister( swh_scheduler, url=instance_url, instance="opam", opam_root=tmp_folder, ) assert lister.instance == "opam" assert lister.opam_root == tmp_folder # call the lister and get all listed origins urls stats = lister.run() assert mock_init.called assert mock_popen.called assert stats.pages == 3 assert stats.origins == 3 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/bar/", f"opam+{instance_url}/packages/baz/", f"opam+{instance_url}/packages/foo/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls -def test_opam_binary(datadir, swh_scheduler): +def test_opam_binary(datadir, swh_scheduler, tmp_path): instance_url = f"file://{datadir}/fake_opam_repo" lister = OpamLister( swh_scheduler, url=instance_url, instance="fake", - opam_root=mkdtemp(prefix="swh_opam_lister"), + opam_root=mkdtemp(dir=tmp_path, prefix="swh_opam_lister"), ) stats = lister.run() assert stats.pages == 4 assert stats.origins == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/agrid/", f"opam+{instance_url}/packages/calculon/", f"opam+{instance_url}/packages/directories/", f"opam+{instance_url}/packages/ocb/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls -def test_opam_multi_instance(datadir, swh_scheduler): +def test_opam_multi_instance(datadir, swh_scheduler, tmp_path): instance_url = f"file://{datadir}/fake_opam_repo" lister = OpamLister( swh_scheduler, url=instance_url, instance="fake", - opam_root=mkdtemp(prefix="swh_opam_lister"), + opam_root=mkdtemp(dir=tmp_path, prefix="swh_opam_lister"), ) stats = lister.run() assert stats.pages == 4 assert stats.origins == 4 scheduler_origins = 
swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/agrid/", f"opam+{instance_url}/packages/calculon/", f"opam+{instance_url}/packages/directories/", f"opam+{instance_url}/packages/ocb/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index 34edeec..6a519c4 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,368 +1,370 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass, field import datetime from enum import Enum import logging import re from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from xml.etree import ElementTree import iso8601 import requests from tenacity.before_sleep import before_sleep_log from swh.core.api.classes import stream_results from swh.lister.utils import retry_policy_generic, throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. 
import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) class VcsNames(Enum): """Used to filter SourceForge tool names for valid VCS types""" # CVS projects are read-only CVS = "cvs" GIT = "git" SUBVERSION = "svn" MERCURIAL = "hg" BAZAAR = "bzr" VCS_NAMES = set(v.value for v in VcsNames.__members__.values()) @dataclass class SourceForgeListerEntry: vcs: VcsNames url: str last_modified: datetime.date SubSitemapNameT = str ProjectNameT = str # SourceForge only offers day-level granularity, which is good enough for our purposes LastModifiedT = datetime.date @dataclass class SourceForgeListerState: """Current state of the SourceForge lister in incremental runs """ """If the subsitemap does not exist, we assume a full run of this subsitemap is needed. If the date is the same, we skip the subsitemap, otherwise we request the subsitemap and look up every project's "last modified" date to compare against `ListedOrigins` from the database.""" subsitemap_last_modified: Dict[SubSitemapNameT, LastModifiedT] = field( default_factory=dict ) """Some projects (not the majority, but still meaningful) have no VCS for us to archive. We need to remember a mapping of their API URL to their "last modified" date so we don't keep querying them needlessly every time.""" empty_projects: Dict[str, LastModifiedT] = field(default_factory=dict) SourceForgeListerPage = List[SourceForgeListerEntry] MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" # API resource endpoint for information about the given project. # # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe` # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" # Predictable URL for cloning (in the broad sense) a VCS registered for the project. 
# # `vcs`: VCS type, one of `VCS_NAMES` # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe`. # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses # `git` (https://git.code.sf.net/p/codeblocks/git). CLONE_URL_FORMAT = "https://{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" PROJ_URL_RE = re.compile( r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" ) # Mapping of `(namespace, project name)` to `last modified` date. ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge. """ # Part of the lister API, that identifies this lister LISTER_NAME = "sourceforge" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, ) # Will hold the currently saved "last modified" dates to compare against our # requests.
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None self.session = requests.Session() # Declaring the USER_AGENT is more sysadm-friendly for the forge we list self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) self.incremental = incremental def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState: subsitemaps = { k: datetime.date.fromisoformat(v) for k, v in d.get("subsitemap_last_modified", {}).items() } empty_projects = { k: datetime.date.fromisoformat(v) for k, v in d.get("empty_projects", {}).items() } return SourceForgeListerState( subsitemap_last_modified=subsitemaps, empty_projects=empty_projects ) def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]: return { "subsitemap_last_modified": { k: v.isoformat() for k, v in state.subsitemap_last_modified.items() }, "empty_projects": { k: v.isoformat() for k, v in state.empty_projects.items() }, } def projects_last_modified(self) -> ProjectsLastModifiedCache: if not self.incremental: # No point in loading the previous results if we're doing a full run return {} if self._project_last_modified is not None: return self._project_last_modified # We know there will be at least that many origins stream = stream_results( self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000 ) listed_origins = dict() # Projects can have slashes in them if they're subprojects, but the # mount point (last component) cannot. url_match = re.compile( r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*" ) for origin in stream: url = origin.url match = url_match.match(url) assert match is not None matches = match.groupdict() namespace = matches["namespace"] project = matches["project"] # "Last modified" dates are the same across all VCS (tools, even) # within a project or subproject. An assertion here would be overkill.
last_modified = origin.last_update assert last_modified is not None listed_origins[(namespace, project)] = last_modified.date() self._project_last_modified = listed_origins return listed_origins @throttling_retry( retry=retry_policy_generic, before_sleep=before_sleep_log(logger, logging.WARNING), ) def page_request(self, url, params) -> requests.Response: # Log listed URL to ease debugging logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: # Log response content to ease debugging logger.warning( "Unexpected HTTP status code %s for URL %s", response.status_code, response.url, ) # The lister must fail on blocking errors response.raise_for_status() return response def get_pages(self) -> Iterator[SourceForgeListerPage]: """ SourceForge has a main XML sitemap that lists its sharded sitemaps for all projects. Each XML sub-sitemap lists project pages, which are not unique per project: a project can have a wiki, a home, a git, an svn, etc. For each unique project, we query an API endpoint that lists (among other things) the tools associated with said project, some of which are the VCS used. Subprojects are considered separate projects. Lastly we use the information of which VCS are used to build the predictable clone URL for any given VCS. 
""" sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text tree = ElementTree.fromstring(sitemap_contents) for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_el is not None and last_modified_el.text is not None last_modified = datetime.date.fromisoformat(last_modified_el.text) location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None and location.text is not None sub_url = location.text if self.incremental: recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url) if recorded_last_mod == last_modified: # The entire subsitemap hasn't changed, so none of its projects # have either, skip it. continue self.state.subsitemap_last_modified[sub_url] = last_modified subsitemap_contents = self.page_request(sub_url, {}).text subtree = ElementTree.fromstring(subsitemap_contents) yield from self._get_pages_from_subsitemap(subtree) def get_origins_from_page( self, page: SourceForgeListerPage ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: + last_modified: str = str(hit.last_modified) + last_update: datetime.datetime = iso8601.parse_date(last_modified) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, - last_update=iso8601.parse_date(hit.last_modified), + last_update=last_update, ) def _get_pages_from_subsitemap( self, subtree: ElementTree.Element ) -> Iterator[SourceForgeListerPage]: projects: Set[ProjectNameT] = set() for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_block is not None last_modified = last_modified_block.text location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None project_url = location.text assert project_url is not None match = PROJ_URL_RE.match(project_url) if match: matches = 
match.groupdict() namespace = matches["namespace"] if namespace == "projects": # These have a `p`-namespaced counterpart, use that instead continue project = matches["project"] rest = matches["rest"] if rest.count("/") > 1: # This is a subproject. There exists no sub-subprojects. subproject_name = rest.rsplit("/", 2)[0] project = f"{project}/{subproject_name}" prev_len = len(projects) projects.add(project) if prev_len == len(projects): # Already seen continue pages = self._get_pages_for_project(namespace, project, last_modified) if pages: yield pages else: logger.debug("Project '%s' does not have any VCS", project) else: # Should almost always match, let's log it # The only ones that don't match are mostly specialized one-off URLs. msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) def _get_pages_for_project( self, namespace, project, last_modified ) -> SourceForgeListerPage: endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project) empty_project_last_modified = self.state.empty_projects.get(endpoint) if empty_project_last_modified is not None: if last_modified == empty_project_last_modified.isoformat(): # Project has not changed, so is still empty, meaning it has # no VCS attached that we can archive. 
logger.debug(f"Project {namespace}/{project} is still empty") return [] if self.incremental: expected = self.projects_last_modified().get((namespace, project)) if expected is not None: if expected.isoformat() == last_modified: # Project has not changed logger.debug(f"Project {namespace}/{project} has not changed") return [] else: logger.debug(f"Project {namespace}/{project} was updated") else: msg = "New project during an incremental run: %s/%s" logger.debug(msg, namespace, project) try: res = self.page_request(endpoint, {}).json() except requests.HTTPError: # We've already logged in `page_request` return [] tools = res.get("tools") if tools is None: # This rarely happens, on very old URLs logger.warning("Project '%s' does not have any tools", endpoint) return [] hits = [] for tool in tools: tool_name = tool["name"] if tool_name not in VCS_NAMES: continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, project=project, mount_point=tool["mount_point"], ) if tool_name == VcsNames.MERCURIAL.value: # SourceForge does not yet support anonymous HTTPS cloning for Mercurial # See https://sourceforge.net/p/forge/feature-requests/727/ url = url.replace("https://", "http://") entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) hits.append(entry) if not hits: date = datetime.date.fromisoformat(last_modified) self.state.empty_projects[endpoint] = date else: self.state.empty_projects.pop(endpoint, None) return hits diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index 4b8e7ae..e2585df 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -1,44 +1,48 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.lister.cli import SUPPORTED_LISTERS, get_lister 
lister_args = { "cgit": {"url": "https://git.eclipse.org/c/",}, "phabricator": { "instance": "softwareheritage", "url": "https://forge.softwareheritage.org/api/diffusion.repository.search", "api_token": "bogus", }, "gitea": {"url": "https://try.gitea.io/api/v1/",}, "tuleap": {"url": "https://tuleap.net",}, "gitlab": {"url": "https://gitlab.ow2.org/api/v4", "instance": "ow2",}, "opam": {"url": "https://opam.ocaml.org", "instance": "opam"}, + "maven": { + "url": "https://repo1.maven.org/maven2/", + "index_url": "http://indexes/export.fld", + }, } def test_get_lister_wrong_input(): """Unsupported lister should raise""" with pytest.raises(ValueError) as e: get_lister("unknown", "db-url") assert "Invalid lister" in str(e.value) def test_get_lister(swh_scheduler_config): """Instantiating a supported lister should be ok """ # Drop launchpad lister from the lister to check, its test setup is more involved # than the other listers and it's not currently done here for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name, scheduler={"cls": "local", **swh_scheduler_config}, **lister_args.get(lister_name, {}), ) assert hasattr(lst, "run")