diff --git a/PKG-INFO b/PKG-INFO index 0f81c75..98e33d1 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,127 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.6.1 +Version: 2.6.2 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component of the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origin listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` with one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2.
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration, shared by all listers, to add in the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service (port 5008) to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/docs/run_a_new_lister.rst b/docs/run_a_new_lister.rst index f18f91b..209f420 100644 --- a/docs/run_a_new_lister.rst +++ b/docs/run_a_new_lister.rst @@ -1,87 +1,87 @@ .. _run-lister-tutorial: Tutorial: run a lister within docker-dev in just a few steps ============================================================ It is a good practice to run your new lister in docker-dev. This provides an almost production-like environment. Testing the lister in docker-dev prior to deployment reduces the chances of encountering errors when moving it to production. Here are the steps you need to follow to run a lister within your local environment. -1. You must edit the docker-compose override file (`docker-compose.override.yml`). - following the sample provided :: +1. You must edit the docker-compose override file (:file:`docker-compose.override.yml`), + following the sample provided:: version: '2' services: swh-lister: volumes: - "$SWH_ENVIRONMENT_HOME/swh-lister:/src/swh-lister" - The file named `docker-compose.override.yml` will automatically be loaded by + The file named :file:`docker-compose.override.yml` will automatically be loaded by ``docker-compose``. Having an override makes it possible to run a docker container with some swh packages installed from sources instead of using the latest published packages from PyPI. For more details, you may refer to the README.md present in ``swh-docker-dev``. 2. Follow the instructions mentioned under the headings **Preparation steps** and **Configuration file sample** in the README.md of swh-lister. 3. Add the new ``task_modules`` and ``task_queues`` entries for your new lister to the lister configuration. You need to amend the :file:`docker/conf/lister.yml` file to add the entries.
Here is an example for the GNU lister:: celery: task_broker: amqp://guest:guest@amqp// task_modules: ... - swh.lister.gnu.tasks task_queues: ... - swh.lister.gnu.tasks.GNUListerTask 4. Make sure to run the ``storage (5002)`` and ``scheduler (5008)`` services locally. You may use the following command to run docker:: ~/swh-environment/swh-docker-dev$ docker-compose up -d 5. Add the lister task-type in the scheduler. For example, if you want to - add pypi lister task-type :: + add the gnu lister task-type:: ~/swh-environment$ swh scheduler task-type add list-gnu-full \ "swh.lister.gnu.tasks.GNUListerTask" "Full GNU lister" \ --default-interval '1 day' --backoff-factor 1 You can check all the task-types with:: ~/swh-environment$ swh scheduler task-type list Known task types: list-bitbucket-incremental: Incrementally list BitBucket list-cran: Full CRAN Lister list-debian-distribution: List a Debian distribution list-github-full: Full update of GitHub repos list list-github-incremental: ... If your lister creates new loading tasks not yet registered, you need to register those task types as well. 6. Run your lister with the help of the scheduler CLI. You need to add the task to the scheduler using its CLI. For example, you need to execute this command to run the gnu lister:: ~/swh-environment$ swh scheduler --url http://localhost:5008/ task add \ list-gnu-full --policy oneshot After the execution of the lister is complete, you can see the loading tasks created:: ~/swh-environment/swh-lister$ swh scheduler task list You can also check the repositories listed by the lister in the scheduler database, where the lister output is stored. To connect to the database:: ~/swh-environment/docker$ docker-compose exec swh-scheduler bash -c \ 'psql swh-scheduler -c "select url from listed_origins"' diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 65a6db0..d4ae380 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1,382 +1,380 @@ .. _lister-tutorial: Tutorial: list the content of your favorite forge in just a few steps ===================================================================== Overview -------- The three major phases of work in Software Heritage's preservation process, on the technical side, are *listing software sources*, *scheduling updates* and *loading the software artifacts into the archive*. A previous effort in 2017 consisted of designing a framework to make writing a lister a straightforward "fill in the blanks" process, based on the experience gained from the diversity found in the listed services. This is the second iteration on the lister framework design, comprising a library and an API which is easier to work with and less "magic" (read: implicit). This new design is part of a larger effort in redesigning the scheduling system for the recurring tasks updating the content of the archive. .. _fundamentals: Fundamentals ------------ Fundamentally, a basic lister must follow these steps: 1. Issue a network request for a service endpoint. 2. Convert the response data into a model object. 3. Send the model object to the scheduler. Steps 1 and 3 are generic problems that are often already solved by helpers or in other listers. That leaves us mainly to implement step 2, which is simple when the remote service provides an API. .. _prerequisites: Prerequisites ------------- Skills: * object-oriented Python * requesting remote services through HTTP * scraping if no API is offered An analysis of the target service is also needed.
Prepare the following elements to write the lister: * instance names and URLs * requesting scheme: base URL, path, query_string, POST data, headers * authentication types and which one to support, if any * rate-limiting: HTTP codes and headers used * data format: JSON/XML/HTML/...? * mapping between remote data and needed data (ListedOrigin model, internal state) We will now walk through the steps to build a new lister. Please use this template to start with: :download:`new_lister_template.py` .. _lister-declaration: Lister declaration ------------------ In order to write a lister, two basic elements are required. These are the :py:class:`Lister` base class and the :py:class:`ListedOrigin` scheduler model class. Optionally, for listers that need to keep a state and support incremental listing, an additional object :py:class:`ListerState` will come into play. Each lister must subclass :py:class:`Lister <swh.lister.pattern.Lister>` either directly or through a subclass such as :py:class:`StatelessLister <swh.lister.pattern.StatelessLister>` for stateless ones. We extensively type-annotate our listers, as we do any new code, which makes it prominent that those lister classes are generic and take the following parameters: * :py:class:`Lister`: the lister state type, the page type * :py:class:`StatelessLister`: only the page type You can start by declaring a stateless lister and leave the implementation of state for later if the listing needs it. We will see how in :ref:`handling-lister-state`. Both the lister state type and the page type are user-defined types. However, while the page type may only exist as a type annotation, the state type for a stateful lister must be associated with a concrete object. The state type is commonly defined as a dataclass whereas the page type is often a mere annotation, potentially given a nice alias. Example lister declaration:: NewForgePage = List[Dict[str, Any]] @dataclass class NewForgeListerState: ... class NewForgeLister(Lister[NewForgeListerState, NewForgePage]): LISTER_NAME = "My" ... The new lister must declare a name through the :py:attr:`LISTER_NAME` class attribute. .. _lister-construction: Lister construction ------------------- The lister constructor is only required to ask for a :py:class:`SchedulerInterface` object to pass to the base class. But that does not mean it is all that's needed for it to be useful. A lister needs information on which remote service to talk to. It needs a URL. Some services are centralized and offered by a single organization. Think of GitHub. Others are offered by many people across the Internet, each using a different hosting, each providing specific data. Think of the many GitLab instances. We need a name to identify each instance, and even if there is only one, we need its URL to access it concretely. Now, you may think of any strategy to infer the information or hardcode it, but the base class needs a URL and an instance name. In any case, for a multi-instance service, you had better be explicit and require the URL as a constructor argument. We recommend the URL to be some form of a base URL, to be concatenated with any variable part appearing either because there exist multiple instances or because the URL needs recomputation in the listing process. If you need credentials to access a remote service in our polite but persistent fashion (remember that we want fresh information), you are encouraged to provide support for authenticated access. The base class supports handling credentials as a set of identifier/secret pairs.
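For illustration, here is a minimal sketch of the shape this takes, with a hypothetical lister name, instance name and made-up secrets::

    credentials = {
        "newforge": {  # lister name
            "main": [  # instance name
                # identifier/secret pairs; the secret is typically an API token
                {"username": "user0", "password": "<api-token-0>"},
                {"username": "user1", "password": "<api-token-1>"},
            ],
        },
    }
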
The base class knows how to load from a secrets store the right ones for the current ("lister name", "instance name") setting, if none were originally provided through the task parameters. You can ask for other types of access tokens in a separate parameter, but then you lose this advantage. Example of a typical lister constructor:: def __init__( self, scheduler: SchedulerInterface, url: str, instance: str, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, ) ... .. _core-lister-functionality: Core lister functionality ------------------------- For the lister to contribute data to the archive, you now have to write the logic to fetch data from the remote service, and format it in the canonical form the scheduler expects, as outlined in :ref:`fundamentals`. For this purpose, the two methods to implement are:: def get_pages(self) -> Iterator[NewForgePage]: ... def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]: ... Those two core functions are called by the principal lister method, :py:meth:`Lister.run`, found in the base class. :py:meth:`get_pages` is the guts of the lister. It takes no arguments and must produce data pages. An iterator is fine here, as the :py:meth:`Lister.run` method only means to iterate over it in a single pass. This method gets its input from a network request to a remote service's endpoint to retrieve the data we are after. Depending on the service, getting the data adequately structured for our purpose can be tricky. Here you may have to show off your data scraping skills, or just consume a well-designed API. Those aspects are discussed more specifically in the section :ref:`handling-specific-topics`. In any case, we want the data we return to be usefully filtered and structured. The -easiest way to create an iterator is to use the `yield` keyword. Yield each data page +easiest way to create an iterator is to use the ``yield`` keyword. Yield each data page you have structured in accordance with the page type you have declared. The page type exists only for static type checking of data passed from :py:meth:`get_pages` to :py:meth:`get_origins_from_page`; you can choose whatever fits the bill. :py:meth:`get_origins_from_page` is simpler. For each individual software origin you have received in the page, you convert and yield a :py:class:`ListedOrigin` model object. This datatype has the following mandatory fields: * lister id: you generally fill this with the value of :py:attr:`self.lister_obj.id` * visit type: the type of software distribution format the service provides. For use by a corresponding loader. It is an identifier, so you have to either use an existing value or craft a new one if you get off the beaten track and tackle a new software source. But then you will have to discuss the name with the core developers. Example: Phabricator is a forge that can handle Git or SVN repositories. The visit type would be "git" when listing such a repo that provides a Git URL that we can load. * origin URL: a URL that, combined with the visit type, will serve as the input of the loader. This datatype can also further be detailed with the optional fields: * last update date: freshness information on this origin, which is useful to the scheduler for optimizing its scheduling decisions. Fill it if provided by the service, at no substantial additional runtime cost, e.g. in the same request. * extra loader arguments: extra parameters to be passed to the loader for it to be able to load the origin. They are needed, for example, when additional context is required along with the URL to effectively load from the origin.
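To make this concrete, here is a minimal sketch of :py:meth:`get_origins_from_page` for our hypothetical new forge; the ``html_url`` and ``updated_at`` field names are assumptions about the remote data, not any real API::

    def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]:
        assert self.lister_obj.id is not None
        for repo in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=repo["html_url"],  # hypothetical remote field
                visit_type="git",
                # Optional freshness information, when the service provides it.
                last_update=iso8601.parse_date(repo["updated_at"]),
            )
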
-See the definition of ListedOrigin_. +See the definition of :swh_web:`ListedOrigin `. Now that we have shown how those two methods operate, let's put it together by showing how they fit in the principal :py:meth:`Lister.run` method:: def run(self) -> ListerStats: full_stats = ListerStats() try: for page in self.get_pages(): full_stats.pages += 1 origins = self.get_origins_from_page(page) full_stats.origins += self.send_origins(origins) self.commit_page(page) finally: self.finalize() if self.updated: self.set_state_in_scheduler() return full_stats :py:meth:`Lister.send_origins` is the method that sends listed origins to the scheduler. The :py:class:`ListerStats` datastructure, defined along the base lister class, is used to count the number of listed pages and origins in a single lister run. It is useful both to the scheduler, which automatically collects this information, and for testing the lister. You see that the bulk of a lister run consists of streaming data gathered from the remote service to the scheduler. And this is done under a ``try...finally`` construct to have the lister state reliably recorded in case of an unhandled error. We will explain the role of the remaining methods and attributes appearing here in the next section, as they are related to the lister state. -.. _ListedOrigin: https://archive.softwareheritage.org/browse/swh:1:rev:03460207a17d82635ef5a6f12358392143eb9eef/?origin_url=https://forge.softwareheritage.org/source/swh-scheduler.git&path=swh/scheduler/model.py&revision=03460207a17d82635ef5a6f12358392143eb9eef#L134-L177 - .. _handling-lister-state: Handling lister state --------------------- With what we have covered so far, you can write a stateless lister. Unfortunately, some services provide too much data to efficiently deal with in a one-shot fashion. Listing a given software source can take several hours or days to process. Our listers can also give valid output, but fail on an unexpected condition and would have to start over. As we want to be able to resume the listing process from a given element, provided by the remote service and guaranteed to be ordered, such as a date or a numeric identifier, we need to deal with state. The remaining part of the lister API is reserved for dealing with lister state. If the service to list has no pagination, then the data set to handle is small enough to not require keeping lister state. In the opposite case, you will have to determine which piece of information should be recorded in the lister state. As said earlier, we recommend declaring a dataclass for the lister state:: @dataclass class NewForgeListerState: current: str = "" class NewForgeLister(Lister[NewForgeListerState, NewForgePage]): ... A pair of methods, :py:meth:`state_from_dict` and :py:meth:`state_to_dict`, are used respectively to import lister state from the scheduler and export lister state to the scheduler. Some fields may need help to be serialized to the scheduler, such as dates, so this needs to be handled there (see the sketch below). Where is the state used? Taking the general case of a paginating service, the lister state is used at the beginning of the :py:meth:`get_pages` method to initialize the variables associated with the last listing progress. That way we can start from an arbitrary element, or just the first one if there is no last lister state.
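To illustrate the serialization point, here is a minimal sketch assuming a hypothetical ``last_active`` date field on ``NewForgeListerState``::

    def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
        # Parse the ISO date string stored by the scheduler, if present.
        last_active = d.get("last_active")
        if last_active is not None:
            d["last_active"] = iso8601.parse_date(last_active)
        return NewForgeListerState(**d)

    def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
        # Serialize the date back to an ISO string for the scheduler.
        d = asdict(state)
        if d["last_active"] is not None:
            d["last_active"] = d["last_active"].isoformat()
        return d
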
The :py:meth:`commit_page` method is called on successful page processing, after the new origins are sent to the scheduler. Here you should mainly update the lister state by taking into account the new page processed, e.g. advance a date or serial field. Finally, upon either completion or error, the :py:meth:`finalize` method is called. There you must set the attribute :py:attr:`updated` to True if you were successful in advancing in the listing process. To do this you will commonly retrieve the latest saved lister state from the scheduler and compare it with your current lister state. If the lister state was updated, the current lister state will ultimately be recorded in the scheduler. We have now seen the stateful lister API. Note that some listers may implement more flexibility in the use of lister state. Some allow an ``incremental`` parameter that governs whether we will do a stateful listing or not. It is up to you to support additional functionality if it seems relevant. .. _handling-specific-topics: Handling specific topics ------------------------ Here is a quick coverage of common topics left out of the lister construction and :py:meth:`get_pages` descriptions. Sessions ^^^^^^^^ When requesting a web service repeatedly, most parameters including headers do not change and could be set up once initially. We recommend setting up, e.g., an HTTP session as an instance attribute so that further requesting code can focus on what really changes. Some ubiquitous HTTP headers include "Accept", to be set to the service response format, and "User-Agent", for which we provide a recommended value :py:const:`USER_AGENT` to be imported from :py:mod:`swh.lister`. Authentication is also commonly provided through headers, so you can also set it up in the session. Transport error handling ^^^^^^^^^^^^^^^^^^^^^^^^ We generally recommend logging every unhandleable error with the response content and then immediately stopping the listing by doing an equivalent of -:py:meth:`Response.raise_for_status` from the `requests` library. As for rate-limiting +:py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting errors, we have a strategy of using a flexible decorator to handle the retrying for us. -It is based on the `tenacity` library and accessible as :py:func:`throttling_retry` from +It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from :py:mod:`swh.lister.utils`. Pagination ^^^^^^^^^^ This one is a moving target. You have to understand how the pagination mechanics of the particular service work. Some guidelines though. The identifier may be minimal (an id to pass as a query parameter), compound (a set of such parameters) or complete (a whole URL). If the service provides the next URL, use it. The piece of information may be found either in the response body, or in a header. Once identified, you still have to implement the logic of requesting and extracting it in a loop and quitting the loop when there is no more data to fetch. Page results ^^^^^^^^^^^^ First, when retrieving page results, which involves some protocols and parsing logic, please make sure that any deviation from what was expected will result in an informational error. You also have to simplify the results, both by filtering request parameters if the service supports it, and by extracting from the response only the information needed into a structured page. This all makes for easier debugging. A sketch combining these topics follows.
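As a minimal sketch tying these topics together for our hypothetical new forge: a session set up once, retries through :py:func:`throttling_retry`, and a loop driven by a complete next-page URL. The ``/repositories`` endpoint and the ``next`` link header are assumptions for illustration::

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str) -> requests.Response:
        # The session headers (Accept, User-Agent, auth) were set up once
        # in the constructor.
        response = self.session.get(url)
        if response.status_code != 200:
            # Log the unhandleable error with the response content...
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # ...then stop the listing, or let the decorator retry on rate-limiting.
        response.raise_for_status()
        return response

    def get_pages(self) -> Iterator[NewForgePage]:
        url: Optional[str] = f"{self.url}/repositories"
        while url:
            response = self.page_request(url)
            yield response.json()
            # Complete next-page URL from the parsed Link header, if any.
            url = response.links.get("next", {}).get("url")
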
Misc files -^^^^^^^^^^^^^^^ +^^^^^^^^^^ There are also a few files that need to be modified outside of the lister directory, namely: -* `/setup.py` to add your lister to the end of the list in the *setup* section: +* :file:`/setup.py` to add your lister to the end of the list in the *setup* section:: entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register ...""" -* `/swh/lister/tests/test_cli.py` to get a default set of parameters in scheduler-related tests. -* `/README.md` to reference the new lister. -* `/CONTRIBUTORS` to add your name. +* :file:`/swh/lister/tests/test_cli.py` to get a default set of parameters in scheduler-related tests. +* :file:`/README.md` to reference the new lister. +* :file:`/CONTRIBUTORS` to add your name. Testing your lister ------------------- When developing a new lister, it's important to test it. For this, add the tests -(check `swh/lister/*/tests/`) and register the celery tasks in the main -conftest.py (`swh/lister/core/tests/conftest.py`). +(check :file:`swh/lister/*/tests/`) and register the celery tasks in the main +conftest.py (:file:`swh/lister/core/tests/conftest.py`). Another important step is to actually run it within the docker-dev (:ref:`run-lister-tutorial`). More about listers ------------------ See currently implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_). .. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py .. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py .. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py .. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py diff --git a/requirements-test.txt b/requirements-test.txt index c73a59f..5463235 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,6 @@ -pytest +pytest < 7.0.0 # v7.0.0 removed _pytest.tmpdir.TempdirFactory, which is used by some of the pytest plugins we use pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/requirements.txt b/requirements.txt index 8d9bb82..c57eecc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib -tenacity +tenacity >= 6.2 xmltodict diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 0f81c75..98e33d1 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,127 +1,127 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.6.1 +Version: 2.6.2 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type:
text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component of the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origin listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` with one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration, shared by all listers, to add in the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service (port 5008) to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program.
diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index d0db159..238486d 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,246 +1,247 @@ .gitignore .pre-commit-config.yaml ACKNOWLEDGEMENTS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/new_lister_template.py docs/run_a_new_lister.rst docs/save_forge.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/cli.py swh/lister/pattern.py swh/lister/py.typed swh/lister/utils.py swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/test_lister.py swh/lister/bitbucket/tests/test_tasks.py swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json swh/lister/bitbucket/tests/data/bb_api_repositories_page2.json swh/lister/cgit/__init__.py swh/lister/cgit/lister.py swh/lister/cgit/tasks.py swh/lister/cgit/tests/__init__.py swh/lister/cgit/tests/repo_list.txt swh/lister/cgit/tests/test_lister.py swh/lister/cgit/tests/test_tasks.py swh/lister/cgit/tests/data/https_git.baserock.org/cgit swh/lister/cgit/tests/data/https_git.eclipse.org/c swh/lister/cgit/tests/data/https_git.savannah.gnu.org/README swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit_elisp-es.git swh/lister/cgit/tests/data/https_git.tizen/README swh/lister/cgit/tests/data/https_git.tizen/cgit swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=100 swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=50 swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Users swh/lister/cgit/tests/data/https_git.tizen/cgit_Lock-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-base swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-mc1n2 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e3250 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e4x12 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_devices_nfc-plugin-nxp swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_intel_mfld_bootstub-mfld-blackbay swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_mtdev swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_opengl-es-virtual-drv swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libdrm swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libnl swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_xorg_driver_xserver-xorg-misc swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-gallery-efl swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-homescreen-efl 
swh/lister/cgit/tests/data/https_jff.email/cgit swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/cran/tests/data/list-r-packages.json swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/test_lister.py swh/lister/debian/tests/test_tasks.py swh/lister/debian/tests/data/Sources_bullseye swh/lister/debian/tests/data/Sources_buster swh/lister/debian/tests/data/Sources_stretch swh/lister/gitea/__init__.py swh/lister/gitea/lister.py swh/lister/gitea/tasks.py swh/lister/gitea/tests/__init__.py swh/lister/gitea/tests/test_lister.py swh/lister/gitea/tests/test_tasks.py swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/tasks.py swh/lister/github/tests/__init__.py swh/lister/github/tests/test_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/test_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json swh/lister/gitlab/tests/data/https_gitlab.com/api_response_page1.json swh/lister/gnu/__init__.py swh/lister/gnu/lister.py swh/lister/gnu/tasks.py swh/lister/gnu/tree.py swh/lister/gnu/tests/__init__.py swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/gnu/tests/test_tree.py swh/lister/gnu/tests/data/tree.json swh/lister/gnu/tests/data/tree.min.json swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz swh/lister/launchpad/__init__.py swh/lister/launchpad/lister.py swh/lister/launchpad/tasks.py swh/lister/launchpad/tests/__init__.py swh/lister/launchpad/tests/conftest.py swh/lister/launchpad/tests/test_lister.py swh/lister/launchpad/tests/test_tasks.py swh/lister/launchpad/tests/data/launchpad_response1.json swh/lister/launchpad/tests/data/launchpad_response2.json swh/lister/maven/README.md swh/lister/maven/__init__.py swh/lister/maven/lister.py swh/lister/maven/tasks.py swh/lister/maven/tests/__init__.py swh/lister/maven/tests/test_lister.py swh/lister/maven/tests/test_tasks.py swh/lister/maven/tests/data/http_indexes/export.fld swh/lister/maven/tests/data/http_indexes/export_incr.fld swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom +swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/tasks.py swh/lister/npm/tests/test_lister.py swh/lister/npm/tests/test_tasks.py swh/lister/npm/tests/data/npm_full_page1.json swh/lister/npm/tests/data/npm_full_page2.json swh/lister/npm/tests/data/npm_incremental_page1.json swh/lister/npm/tests/data/npm_incremental_page2.json swh/lister/opam/__init__.py swh/lister/opam/lister.py swh/lister/opam/tasks.py swh/lister/opam/tests/__init__.py swh/lister/opam/tests/test_lister.py 
swh/lister/opam/tests/test_tasks.py swh/lister/opam/tests/data/fake_opam_repo/repo swh/lister/opam/tests/data/fake_opam_repo/version swh/lister/opam/tests/data/fake_opam_repo/packages/agrid/agrid.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.4/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.5/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.6/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/ocb/ocb.0.1/opam swh/lister/packagist/__init__.py swh/lister/packagist/lister.py swh/lister/packagist/tasks.py swh/lister/packagist/tests/__init__.py swh/lister/packagist/tests/test_lister.py swh/lister/packagist/tests/test_tasks.py swh/lister/packagist/tests/data/den1n_contextmenu.json swh/lister/packagist/tests/data/ljjackson_linnworks.json swh/lister/packagist/tests/data/lky_wx_article.json swh/lister/packagist/tests/data/spryker-eco_computop-api.json swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/phabricator/tests/data/__init__.py swh/lister/phabricator/tests/data/phabricator_api_repositories_page1.json swh/lister/phabricator/tests/data/phabricator_api_repositories_page2.json swh/lister/pypi/__init__.py swh/lister/pypi/lister.py swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/test_lister.py swh/lister/pypi/tests/test_tasks.py swh/lister/sourceforge/__init__.py swh/lister/sourceforge/lister.py swh/lister/sourceforge/tasks.py swh/lister/sourceforge/tests/__init__.py swh/lister/sourceforge/tests/test_lister.py swh/lister/sourceforge/tests/test_tasks.py swh/lister/sourceforge/tests/data/adobexmp.json swh/lister/sourceforge/tests/data/backapps-website.json swh/lister/sourceforge/tests/data/backapps.json swh/lister/sourceforge/tests/data/main-sitemap.xml swh/lister/sourceforge/tests/data/mojunk.json swh/lister/sourceforge/tests/data/mramm.json swh/lister/sourceforge/tests/data/os3dmodels.json swh/lister/sourceforge/tests/data/random-mercurial.json swh/lister/sourceforge/tests/data/subsitemap-0.xml swh/lister/sourceforge/tests/data/subsitemap-1.xml swh/lister/tests/__init__.py swh/lister/tests/test_cli.py swh/lister/tests/test_pattern.py swh/lister/tests/test_utils.py swh/lister/tuleap/__init__.py swh/lister/tuleap/lister.py swh/lister/tuleap/tasks.py swh/lister/tuleap/tests/__init__.py swh/lister/tuleap/tests/test_lister.py swh/lister/tuleap/tests/test_tasks.py swh/lister/tuleap/tests/data/https_tuleap.net/projects swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 swh/lister/tuleap/tests/data/https_tuleap.net/repo_3 \ No newline at end of file diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 847447b..e4f915e 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,18 +1,18 @@ 
python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib -tenacity +tenacity>=6.2 xmltodict swh.core[db]>=0.9 swh.scheduler>=0.8 [testing] -pytest +pytest<7.0.0 pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/__init__.py b/swh/__init__.py index 8d9f151..b36383a 100644 --- a/swh/__init__.py +++ b/swh/__init__.py @@ -1,4 +1,3 @@ from pkgutil import extend_path -from typing import List -__path__: List[str] = extend_path(__path__, __name__) +__path__ = extend_path(__path__, __name__) diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 6c3f7e6..61006b0 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,265 +1,265 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging import random from typing import Any, Dict, Iterator, Optional, Tuple from urllib.parse import parse_qs, urlencode, urlparse import iso8601 import requests from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import is_retryable_exception, retry_attempt, throttling_retry +from swh.lister.utils import is_retryable_exception, throttling_retry from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) # Some instances provide the hg_git type, which can be ingested as hg origins VCS_MAPPING = {"hg_git": "hg"} @dataclass class GitLabListerState: """State of the GitLabLister""" last_seen_next_link: Optional[str] = None """Last link header (not visited yet) during an incremental pass """ Repository = Dict[str, Any] @dataclass class PageResult: """Result from a query to a gitlab project api page.""" repositories: Optional[Tuple[Repository, ...]] = None next_page: Optional[str] = None def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate for handling HTTP responses with status code 403 and a specific rate-limit header. """ - attempt = retry_attempt(retry_state) + attempt = retry_state.outcome if attempt.failed: exc = attempt.exception() return ( isinstance(exc, HTTPError) and exc.response.status_code == codes.forbidden and int(exc.response.headers.get("RateLimit-Remaining", "0")) == 0 ) or is_retryable_exception(exc) return False def _parse_id_after(url: Optional[str]) -> Optional[int]: """Given a URL, extract and return the value of its 'id_after' query parameter, or None. This is the repository id used for pagination purposes. """ if not url: return None # link: https://${project-api}/?...&id_after=2x... query_data = parse_qs(urlparse(url).query) page = query_data.get("id_after") if page and len(page) > 0: return int(page[0]) return None class GitLabLister(Lister[GitLabListerState, PageResult]): """List origins for a gitlab instance. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_next_link` stored in the scheduler backend. Args: scheduler: a scheduler instance url: the API v4 URL of the gitlab instance to visit (e.g. https://gitlab.com/api/v4/) instance: a specific instance name (e.g.
gitlab, tor, git-kernel, ...), url network location will be used if not provided incremental: defines if incremental listing is activated or not """ def __init__( self, scheduler, url: str, name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, incremental: bool = False, ): if name is not None: self.LISTER_NAME = name super().__init__( scheduler=scheduler, url=url.rstrip("/"), instance=instance, credentials=credentials, ) self.incremental = incremental self.last_page: Optional[str] = None self.per_page = 100 self.session = requests.Session() self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) if len(self.credentials) > 0: cred = random.choice(self.credentials) logger.info( "Using %s credentials from user %s", self.instance, cred["username"] ) api_token = cred["password"] if api_token: self.session.headers["Authorization"] = f"Bearer {api_token}" def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: return GitLabListerState(**d) def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: logger.debug("Fetching URL %s", url) response = self.session.get(url) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) # GitLab API can return errors 500 when listing projects. # https://gitlab.com/gitlab-org/gitlab/-/issues/262629 # To avoid ending the listing prematurely, skip buggy URLs and move # to next pages. if response.status_code == 500: id_after = _parse_id_after(url) assert id_after is not None while True: next_id_after = id_after + self.per_page url = url.replace(f"id_after={id_after}", f"id_after={next_id_after}") response = self.session.get(url) if response.status_code == 200: break else: id_after = next_id_after else: response.raise_for_status() repositories: Tuple[Repository, ...] 
= tuple(response.json()) if hasattr(response, "links") and response.links.get("next"): next_page = response.links["next"]["url"] else: next_page = None return PageResult(repositories, next_page) def page_url(self, id_after: Optional[int] = None) -> str: parameters = { "pagination": "keyset", "order_by": "id", "sort": "asc", "simple": "true", "per_page": f"{self.per_page}", } if id_after is not None: parameters["id_after"] = str(id_after) return f"{self.url}/projects?{urlencode(parameters)}" def get_pages(self) -> Iterator[PageResult]: next_page: Optional[str] if self.incremental and self.state and self.state.last_seen_next_link: next_page = self.state.last_seen_next_link else: next_page = self.page_url() while next_page: self.last_page = next_page page_result = self.get_page_result(next_page) yield page_result next_page = page_result.next_page def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None repositories = page_result.repositories if page_result.repositories else [] for repo in repositories: visit_type = repo.get("vcs_type", "git") visit_type = VCS_MAPPING.get(visit_type, visit_type) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["http_url_to_repo"], visit_type=visit_type, last_update=iso8601.parse_date(repo["last_activity_at"]), ) def commit_page(self, page_result: PageResult) -> None: """Update currently stored state using the latest listed "next" page if relevant. Relevancy is determined by the next_page link whose 'page' id must be strictly superior to the currently stored one. Note: this is a noop for full listing mode """ if self.incremental: # link: https://${project-api}/?...&page=2x... next_page = page_result.next_page if not next_page and self.last_page: next_page = self.last_page if next_page: id_after = _parse_id_after(next_page) previous_next_page = self.state.last_seen_next_link previous_id_after = _parse_id_after(previous_next_page) if previous_next_page is None or ( previous_id_after and id_after and previous_id_after < id_after ): self.state.last_seen_next_link = next_page def finalize(self) -> None: """finalize the lister state when relevant (see `fn:commit_page` for details) Note: this is a noop for full listing mode """ next_page = self.state.last_seen_next_link if self.incremental and next_page: # link: https://${project-api}/?...&page=2x... 
next_id_after = _parse_id_after(next_page) scheduler_state = self.get_state_from_scheduler() previous_next_id_after = _parse_id_after( scheduler_state.last_seen_next_link ) if (not previous_next_id_after and next_id_after) or ( previous_next_id_after and next_id_after and previous_next_id_after < next_id_after ): self.updated = True diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 01f6060..f5601b8 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,341 +1,361 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass +from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log from urllib3.util import parse_url import xmltodict from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types git/svn/hg, or whatever the artifacts use as their repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. the URL of the base index used to fetch maven artifacts. For Maven Central, use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental if instance is None: instance = parse_url(url).host super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT,} ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[RepoPage]: """ Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPages: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. - logger.info(f"Downloading text index from {self.INDEX_URL}.") + logger.info("Downloading text index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P<doc>\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|" + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) if ( self.incremental and self.state and self.state.last_seen_doc and self.state.last_seen_doc >= doc_id ): # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = None else: jar_src["doc"] = doc_id else: # If incremental mode, we don't record any line that is # before our last recorded doc id. if self.incremental and jar_src["doc"] is None: continue m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id.
if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin(self.BASE_URL, url_path,) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) - logger.debug(f"* Yielding jar {url_src}.") - yield { + artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } + logger.debug( + "* Yielding jar %s: %s", url_src, artifact_metadata_d + ) + yield artifact_metadata_d url_src = None - logger.info(f"Found {len(out_pom)} poms.") + logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms..") for pom in out_pom: text = self.page_request(pom, {}) try: project = xmltodict.parse(text.content.decode()) if "scm" in project["project"]: if "connection" in project["project"]["scm"]: scm = project["project"]["scm"]["connection"] gid = project["project"]["groupId"] aid = project["project"]["artifactId"] - yield { + artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], "url": scm, "project": f"{gid}.{aid}", } + logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) + yield artifact_metadata_d else: - logger.debug(f"No scm.connection in pom {pom}") + logger.debug("No scm.connection in pom %s", pom) else: - logger.debug(f"No scm in pom {pom}") + logger.debug("No scm in pom %s", pom) except xmltodict.expat.ExpatError as error: - logger.info(f"Could not parse POM {pom} XML: {error}. Next.") + logger.info("Could not parse POM %s XML: %s. Next.", pom, error) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Maven repositories into a list of ListedOrigins. """ assert self.lister_obj.id is not None + scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": # If origin is a scm url: detect scm type and yield. # Note that the official format is: # scm:git:git://github.com/openengsb/openengsb-framework.git # but many, many projects directly put the repo url, so we have to # detect the content to match it properly. 
m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) if m_scm is not None: scm_type = m_scm.group("type") - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, - ) - yield origin + if scm_type in scm_types_ok: + scm_url = m_scm.group("url") + origin = ListedOrigin( + lister_id=self.lister_obj.id, url=scm_url, visit_type=scm_type, + ) + yield origin else: if page["url"].endswith(".git"): origin = ListedOrigin( lister_id=self.lister_obj.id, url=page["url"], visit_type="git", ) yield origin else: # Origin is a source archive: + last_update_dt = None + last_update_iso = "" + last_update_seconds = str(page["time"])[:-3] + try: + last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) + last_update_dt_tz = last_update_dt.astimezone(timezone.utc) + except OverflowError: + logger.warning("- Failed to convert datetime %s.", last_update_seconds) + if last_update_dt: + last_update_iso = last_update_dt_tz.isoformat() origin = ListedOrigin( lister_id=self.lister_obj.id, url=page["url"], visit_type=page["type"], + last_update=last_update_dt, extra_loader_arguments={ "artifacts": [ { - "time": page["time"], + "time": last_update_iso, "gid": page["gid"], "aid": page["aid"], "version": page["version"], + "base_url": self.BASE_URL, } ] }, ) yield origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom new file mode 100644 index 0000000..8234786 --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom @@ -0,0 +1,86 @@ + + + 4.0.0 + al.aldi + sprova4j + 0.1.0 + sprova4j + Java client for Sprova Test Management + https://github.com/aldialimucaj/sprova4j + 2018 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + aldi + Aldi Alimucaj + aldi.alimucaj@gmail.com + + + + scm:https://github.com/aldialimucaj/sprova4j.git + scm:ghttps://github.com/aldialimucaj/sprova4j.git + https://github.com/aldialimucaj/sprova4j + + + + ch.qos.logback + logback-classic + 1.2.3 + runtime + + + com.google.code.gson + gson + 2.8.3 + runtime + + + com.squareup.okhttp3 + okhttp + 3.10.0 + runtime + + + com.squareup.okio + okio + 1.0.0 + runtime + + + org.glassfish + javax.json + 1.1.2 + runtime + + + javax.json + javax.json-api + 1.1.2 + runtime + + + javax.validation + validation-api + 2.0.1.Final + runtime + + + junit + junit + 4.12 + test + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 36a214d..c81ee96 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,252 +1,320 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import timezone from pathlib import Path +import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" LIST_GIT = ( "git://github.com/aldialimucaj/sprova4j.git", "https://github.com/aldialimucaj/sprova4j.git", ) LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) LIST_SRC = ( MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar", MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1-sources.jar", ) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", - "time": 1626109619335, + "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", - "time": 1626111425534, + "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", }, ) @pytest.fixture def maven_index(datadir) -> str: - text = Path(datadir, "http_indexes", "export.fld").read_text() - return text + 
return Path(datadir, "http_indexes", "export.fld").read_text() @pytest.fixture def maven_index_incr(datadir) -> str: - text = Path(datadir, "http_indexes", "export_incr.fld").read_text() - return text + return Path(datadir, "http_indexes", "export_incr.fld").read_text() @pytest.fixture def maven_pom_1(datadir) -> str: - text = Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() - return text + return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_text() + + +@pytest.fixture +def maven_pom_1_malformed(datadir) -> str: + return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_text() @pytest.fixture def maven_pom_2(datadir) -> str: - text = Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() - return text + return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_text() @pytest.fixture def maven_pom_3(datadir) -> str: - text = Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() - return text + return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_text() def test_maven_full_listing( swh_scheduler, requests_mock, mocker, maven_index, maven_pom_1, maven_pom_2, ): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. index_text = maven_index requests_mock.get(INDEX_URL, text=index_text) requests_mock.get(URL_POM_1, text=maven_pom_1) requests_mock.get(URL_POM_2, text=maven_pom_2) # Then run the lister. stats = lister.run() # Start test checks. assert stats.pages == 4 assert stats.origins == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: if src.get("url") == origin.url: + last_update_src = iso8601.parse_date(src.get("time")).astimezone( + tz=timezone.utc + ) + assert last_update_src == origin.last_update artifact = origin.extra_loader_arguments["artifacts"][0] assert src.get("time") == artifact["time"] assert src.get("gid") == artifact["gid"] assert src.get("aid") == artifact["aid"] assert src.get("version") == artifact["version"] + assert MVN_URL == artifact["base_url"] break else: - raise AssertionError + raise AssertionError( + "Could not find scheduler origin in referenced origins." + ) + scheduler_state = lister.get_state_from_scheduler() + assert scheduler_state is not None + assert scheduler_state.last_seen_doc == -1 + assert scheduler_state.last_seen_pom == -1 + + +def test_maven_full_listing_malformed( + swh_scheduler, + requests_mock, + mocker, + maven_index, + maven_pom_1_malformed, + maven_pom_2, +): + """Covers full listing of multiple pages, checking page results with a malformed + scm entry in pom.""" + + lister = MavenLister( + scheduler=swh_scheduler, + url=MVN_URL, + instance="maven.org", + index_url=INDEX_URL, + incremental=False, + ) + + # Set up test. + index_text = maven_index + requests_mock.get(INDEX_URL, text=index_text) + requests_mock.get(URL_POM_1, text=maven_pom_1_malformed) + requests_mock.get(URL_POM_2, text=maven_pom_2) + + # Then run the lister. + stats = lister.run() + + # Start test checks. 
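+    # The index still yields 4 pages, but the malformed pom produces no valid scm origin, so only 3 origins are listed.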
+ assert stats.pages == 4 + assert stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + origin_urls = [origin.url for origin in scheduler_origins] + LIST_SRC_1 = ("https://github.com/aldialimucaj/sprova4j.git",) + assert sorted(origin_urls) == sorted(LIST_SRC_1 + LIST_SRC) + + for origin in scheduler_origins: + if origin.visit_type == "maven": + for src in LIST_SRC_DATA: + if src.get("url") == origin.url: + artifact = origin.extra_loader_arguments["artifacts"][0] + assert src.get("time") == artifact["time"] + assert src.get("gid") == artifact["gid"] + assert src.get("aid") == artifact["aid"] + assert src.get("version") == artifact["version"] + assert MVN_URL == artifact["base_url"] + break + else: + raise AssertionError( + "Could not find scheduler origin in referenced origins." + ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, mocker, maven_index, maven_index_incr, maven_pom_1, maven_pom_2, maven_pom_3, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, text=maven_pom_1) requests_mock.get(URL_POM_2, text=maven_pom_2) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 4 assert stats.origins == 4 # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 3 assert scheduler_state.last_seen_pom == 3 # Set up test. requests_mock.get(INDEX_URL, text=maven_index_incr) requests_mock.get(URL_POM_3, text=maven_pom_3) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 1 assert stats.origins == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: if src.get("url") == origin.url: artifact = origin.extra_loader_arguments["artifacts"][0] assert src.get("time") == artifact["time"] assert src.get("gid") == artifact["gid"] assert src.get("aid") == artifact["aid"] assert src.get("version") == artifact["version"] break else: raise AssertionError scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error( swh_scheduler, requests_mock, mocker, maven_index, http_code ): """Test handling of some common HTTP errors: - 400: Bad request. - 404: Resource not found. - 500: Internal server error. - 502: Bad gateway or proxy error.
""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # Test failure of index retrieval. requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() # Test failure of artefacts retrieval. requests_mock.get(INDEX_URL, text=maven_index) requests_mock.get(URL_POM_1, status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() # If the maven_index step succeeded but not the get_pom step, # then we get only the 2 maven-jar origins (and not the 2 additional # src origins). scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 3df721f..abc27ec 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,181 +1,181 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging from time import sleep from typing import Any, Dict, Iterator, List, Optional, Tuple from xmlrpc.client import Fault, ServerProxy from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import retry_attempt, throttling_retry +from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Type returned by the XML-RPC changelog call: # package, version, release timestamp, description, serial ChangelogEntry = Tuple[str, str, int, str, int] # Manipulated package updated type which is a subset information # of the ChangelogEntry type: package, max release date PackageUpdate = Tuple[str, datetime] # Type returned by listing a page of results PackageListPage = List[PackageUpdate] @dataclass class PyPIListerState: """State of PyPI lister""" last_serial: Optional[int] = None """Last seen serial when visiting the pypi instance""" def _if_rate_limited(retry_state) -> bool: """Custom tenacity retry predicate to handle xmlrpc client error: .. code:: xmlrpc.client.Fault: """ - attempt = retry_attempt(retry_state) + attempt = retry_state.outcome return attempt.failed and isinstance(attempt.exception(), Fault) def pypi_url(package_name: str) -> str: """Build pypi url out of a package name. """ return PyPILister.PACKAGE_URL.format(package_name=package_name) class PyPILister(Lister[PyPIListerState, PackageListPage]): """List origins from PyPI. 
""" LISTER_NAME = "pypi" INSTANCE = "pypi" # As of today only the main pypi.org is used PACKAGE_LIST_URL = "https://pypi.org/pypi" # XML-RPC url PACKAGE_URL = "https://pypi.org/project/{package_name}/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, url=self.PACKAGE_LIST_URL, instance=self.INSTANCE, credentials=credentials, ) # used as termination condition and if useful, becomes the new state when the # visit is done self.last_processed_serial: Optional[int] = None def state_from_dict(self, d: Dict[str, Any]) -> PyPIListerState: return PyPIListerState(last_serial=d.get("last_serial")) def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_last_serial(self, client: ServerProxy) -> int: """Internal detail to allow throttling when calling the changelog last entry""" serial = client.changelog_last_serial() assert isinstance(serial, int) return serial @throttling_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_since_serial( self, client: ServerProxy, serial: int ) -> List[ChangelogEntry]: """Internal detail to allow throttling when calling the changelog listing""" sleep(1) # to avoid the initial warning about throttling return client.changelog_since_serial(serial) # type: ignore def get_pages(self) -> Iterator[PackageListPage]: """Iterate other changelog events per package, determine the max release date for that package and use that max release date as last_update. When the execution is done, this will also set the self.last_processed_serial attribute so we can finalize the state of the lister for the next visit. Yields: List of Tuple of (package-name, max release-date) """ client = ServerProxy(self.url) last_processed_serial = -1 if self.state.last_serial is not None: last_processed_serial = self.state.last_serial upstream_last_serial = self._changelog_last_serial(client) # Paginate through result of pypi, until we read everything while last_processed_serial < upstream_last_serial: updated_packages = defaultdict(list) for package, _, release_date, _, serial in self._changelog_since_serial( client, last_processed_serial ): updated_packages[package].append(release_date) # Compute the max serial so we can stop when done last_processed_serial = max(last_processed_serial, serial) # Returns pages of result to flush regularly yield [ ( pypi_url(package), datetime.fromtimestamp(max(release_dates)).replace( tzinfo=timezone.utc ), ) for package, release_dates in updated_packages.items() ] self.last_processed_serial = upstream_last_serial def get_origins_from_page( self, packages: PackageListPage ) -> Iterator[ListedOrigin]: """Convert a page of PyPI repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for origin, last_update in packages: yield ListedOrigin( lister_id=self.lister_obj.id, url=origin, visit_type="pypi", last_update=last_update, ) def finalize(self): """Finalize the visit state by updating with the new last_serial if updates actually happened. 
""" self.updated = ( self.state and self.state.last_serial and self.last_processed_serial and self.state.last_serial < self.last_processed_serial ) or (not self.state.last_serial and self.last_processed_serial) if self.updated: self.state.last_serial = self.last_processed_serial diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py index 763f743..acb73fe 100644 --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,120 +1,113 @@ -# Copyright (C) 2018-2020 the Software Heritage developers +# Copyright (C) 2018-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import requests from requests.status_codes import codes from tenacity.wait import wait_fixed from swh.lister.utils import ( MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, split_range, throttling_retry, ) @pytest.mark.parametrize( "total_pages,nb_pages,expected_ranges", [ (14, 5, [(0, 4), (5, 9), (10, 14)]), (19, 10, [(0, 9), (10, 19)]), (20, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)]), (21, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21),],), ], ) def test_split_range(total_pages, nb_pages, expected_ranges): actual_ranges = list(split_range(total_pages, nb_pages)) assert actual_ranges == expected_ranges @pytest.mark.parametrize("total_pages,nb_pages", [(None, 1), (100, None)]) def test_split_range_errors(total_pages, nb_pages): for total_pages, nb_pages in [(None, 1), (100, None)]: with pytest.raises(TypeError): next(split_range(total_pages, nb_pages)) TEST_URL = "https://example.og/api/repositories" @throttling_retry() def make_request(): response = requests.get(TEST_URL) response.raise_for_status() return response def assert_sleep_calls(mocker, mock_sleep, sleep_params): - try: - mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) - except AssertionError: - # tenacity < 5.1 has a different behavior for wait_exponential - # https://github.com/jd/tenacity/commit/aac4307a0aa30d7befd0ebe4212ee4fc69083a95 - mock_sleep.assert_has_calls( - [mocker.call(param * WAIT_EXP_BASE) for param in sleep_params] - ) + mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) def test_throttling_retry(requests_mock, mocker): data = {"result": {}} requests_mock.get( TEST_URL, [ {"status_code": codes.too_many_requests}, {"status_code": codes.too_many_requests}, {"status_code": codes.ok, "json": data}, ], ) mock_sleep = mocker.patch.object(make_request.retry, "sleep") response = make_request() assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE]) assert response.json() == data def test_throttling_retry_max_attemps(requests_mock, mocker): requests_mock.get( TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS), ) mock_sleep = mocker.patch.object(make_request.retry, "sleep") with pytest.raises(requests.exceptions.HTTPError) as e: make_request() assert e.value.response.status_code == codes.too_many_requests assert_sleep_calls( mocker, mock_sleep, [float(WAIT_EXP_BASE ** i) for i in range(MAX_NUMBER_ATTEMPTS - 1)], ) @throttling_retry(wait=wait_fixed(WAIT_EXP_BASE)) def make_request_wait_fixed(): response = requests.get(TEST_URL) response.raise_for_status() return response def test_throttling_retry_wait_fixed(requests_mock, mocker): requests_mock.get( TEST_URL, [ {"status_code": codes.too_many_requests}, {"status_code": codes.too_many_requests}, {"status_code": codes.ok}, ], ) mock_sleep = 
mocker.patch.object(make_request_wait_fixed.retry, "sleep") make_request_wait_fixed() assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2) diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 9df6907..ea4a989 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,134 +1,121 @@ # Copyright (C) 2018-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Callable, Iterator, Tuple from requests.exceptions import ConnectionError, HTTPError from requests.status_codes import codes from tenacity import retry as tenacity_retry from tenacity.stop import stop_after_attempt from tenacity.wait import wait_exponential def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: """Split `total_pages` into mostly `nb_pages` ranges. In some cases, the last range can have one more element. >>> list(split_range(19, 10)) [(0, 9), (10, 19)] >>> list(split_range(20, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)] >>> list(split_range(21, 3)) [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21)] """ prev_index = None for index in range(0, total_pages, nb_pages): if index is not None and prev_index is not None: yield prev_index, index - 1 prev_index = index if index != total_pages: yield index, total_pages def is_throttling_exception(e: Exception) -> bool: """ Checks if an exception is a requests.exception.HTTPError for a response with status code 429 (too many requests). """ return ( isinstance(e, HTTPError) and e.response.status_code == codes.too_many_requests ) def is_retryable_exception(e: Exception) -> bool: """ Checks if an exception is worth retrying (connection, throttling or a server error). """ is_connection_error = isinstance(e, ConnectionError) is_500_error = isinstance(e, HTTPError) and e.response.status_code >= 500 return is_connection_error or is_throttling_exception(e) or is_500_error -def retry_attempt(retry_state): - """ - Utility function to get last retry attempt info based on the - tenacity version (as debian buster packages version 4.12). - """ - try: - attempt = retry_state.outcome - except AttributeError: - # tenacity < 5.0 - attempt = retry_state - return attempt - - def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> bool: """ Custom tenacity retry predicate for handling exceptions with the given predicate. """ - attempt = retry_attempt(retry_state) + attempt = retry_state.outcome if attempt.failed: exception = attempt.exception() return predicate(exception) return False def retry_if_throttling(retry_state) -> bool: """ Custom tenacity retry predicate for handling HTTP responses with status code 429 (too many requests). """ return retry_if_exception(retry_state, is_throttling_exception) def retry_policy_generic(retry_state) -> bool: """ Custom tenacity retry predicate for handling failed requests: - ConnectionError - Server errors (status >= 500) - Throttling errors (status == 429) This does not handle 404, 403 or other status codes. """ return retry_if_exception(retry_state, is_retryable_exception) WAIT_EXP_BASE = 10 MAX_NUMBER_ATTEMPTS = 5 def throttling_retry( retry=retry_if_throttling, wait=wait_exponential(exp_base=WAIT_EXP_BASE), stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), **retry_args, ): """ Decorator based on `tenacity` for retrying a function possibly raising requests.exception.HTTPError for status code 429 (too many requests). 
It provides a default configuration that should work properly in most cases but all `tenacity.retry` parameters can also be overridden in client code. When the maximum number of attempts is reached, the HTTPError exception will then be reraised. Args: retry: function defining request retry condition (defaults to 429 status code) https://tenacity.readthedocs.io/en/latest/#whether-to-retry wait: function defining wait strategy before retrying (defaults to exponential backoff) https://tenacity.readthedocs.io/en/latest/#waiting-before-retrying stop: function defining when to stop retrying (defaults to after 5 attempts) https://tenacity.readthedocs.io/en/latest/#stopping """ return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args) diff --git a/tox.ini b/tox.ini index 66e29ea..cef302c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,77 +1,77 @@ [tox] envlist=black,flake8,mypy,py3 [testenv] extras = testing deps = swh.core[http] >= 0.0.61 swh.scheduler[testing] >= 0.5.0 amqp != 5.0.4 pytest-cov dev: ipdb commands = pytest \ !dev: --cov={envsitepackagesdir}/swh/lister/ --cov-branch \ --doctest-modules \ {envsitepackagesdir}/swh/lister/ {posargs} [testenv:black] skip_install = true deps = black==19.10b0 commands = {envpython} -m black --check swh [testenv:flake8] skip_install = true deps = flake8 commands = {envpython} -m flake8 [testenv:mypy] extras = testing deps = - mypy + mypy==0.920 commands = mypy swh # build documentation outside swh-environment using the current # git HEAD of swh-docs, is executed on CI for each diff to prevent # breaking doc build [testenv:sphinx] whitelist_externals = make usedevelop = true extras = testing deps = # fetch and install swh-docs in develop mode -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs # build documentation only inside swh-environment using local state # of swh-docs package [testenv:sphinx-dev] whitelist_externals = make usedevelop = true extras = testing deps = # install swh-docs in develop mode -e ../swh-docs setenv = SWH_PACKAGE_DOC_TOX_BUILD = 1 # turn warnings into errors SPHINXOPTS = -W commands = make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs
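As a usage illustration of the `throttling_retry` decorator changed above, here is a minimal sketch of overriding its default wait strategy; `fetch_page` and the 2-second fixed wait are hypothetical, only `throttling_retry` and `tenacity.wait_fixed` come from the code in this patch:

import requests
from tenacity.wait import wait_fixed

from swh.lister.utils import throttling_retry


# Hypothetical helper: retry throttled (429) requests with a fixed 2s wait
# instead of the default exponential backoff.
@throttling_retry(wait=wait_fixed(2))
def fetch_page(url: str) -> requests.Response:
    response = requests.get(url)
    # A 429 response raises HTTPError here, which triggers a retry;
    # once MAX_NUMBER_ATTEMPTS is reached the exception is reraised.
    response.raise_for_status()
    return response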