diff --git a/PKG-INFO b/PKG-INFO index 6687f51..4c7c008 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,127 +1,123 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.2 +Version: 2.9.3 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origin listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` with one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2.
create the configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration shared by all listers, to add to the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: this expects the scheduler service to be running locally on port 5008. ## Executing a lister Once configured, a lister can be executed using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. - - diff --git a/conftest.py b/conftest.py index da8b930..00eb31a 100644 --- a/conftest.py +++ b/conftest.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -pytest_plugins = ["swh.scheduler.pytest_plugin"] +pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"] os.environ["LC_ALL"] = "C.UTF-8" diff --git a/requirements-swh.txt b/requirements-swh.txt index b451cdb..3281b3e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db] >= 0.9 +swh.core[db,github] >= 2.8 swh.scheduler >= 0.8 diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 6687f51..4c7c008 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,127 +1,123 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.2 +Version: 2.9.3 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier:
Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origin listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` with one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create the configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration shared by all listers, to add to the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: this expects the scheduler service to be running locally on port 5008. ## Executing a lister Once configured, a lister can be executed using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program.
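Both copies of the README leave `credentials` empty in the sample configuration. For authenticated listing, which the GitHub rate-limit handling changed below depends on, credentials form a mapping keyed by lister name, then instance name. A minimal sketch of that structure in Python, matching the `CredentialsType` shape exercised by the test fixtures later in this diff; the usernames and tokens are placeholders:

```python
# Placeholder credentials; shape: {lister_name: {instance_name: [auth, ...]}}
credentials = {
    "github": {
        "github": [
            {"username": "some-user", "token": "some-token"},
            # legacy-style entries carry the token under "password" instead
            {"username": "legacy-user", "password": "legacy-token"},
        ]
    }
}
```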
- - diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 6caae43..5e69dc1 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,19 +1,19 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity>=6.2 xmltodict lxml -swh.core[db]>=0.9 +swh.core[db,github]>=2.8 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 2655744..acef224 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,208 +1,208 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import datetime import logging from typing import Any, Dict, Iterator, List, Optional, Set from urllib.parse import parse_qs, urlparse import iso8601 +from swh.core.github.utils import GitHubSession, MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister -from .utils import GitHubSession, MissingRateLimitReset logger = logging.getLogger(__name__) @dataclass class GitHubListerState: """State of the GitHub lister""" last_seen_id: int = 0 """Numeric id of the last repository listed on an incremental pass""" class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """List origins from GitHub. By default, the lister runs in incremental mode: it lists all repositories, starting with the `last_seen_id` stored in the scheduler backend. Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in that mode, the lister finds the origins present in the range **excluding** `first_id` and **including** `last_id`. In this mode, the lister can overrun the `last_id`: it will always record all the origins seen in a given page. As the lister is fully idempotent, this is not a practical problem. Once relisting completes, the lister state in the scheduler backend is not updated. When the config contains a set of credentials, we shuffle this list at the beginning of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the same token over and over again, until its rate limit runs out. Once that happens, we switch to the next token over in our shuffled list. When a request fails with a rate limit exception for all tokens, we pause the listing until the largest value for X-Ratelimit-Reset over all tokens. When the credentials aren't set in the lister config, the lister can run in anonymous mode too (e.g. for testing purposes). .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits Args: first_id: the id of the first repo to list last_id: stop listing after seeing a repo with an id higher than this value. 
""" # noqa: B950 LISTER_NAME = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, first_id: Optional[int] = None, last_id: Optional[int] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance="github", ) self.first_id = first_id self.last_id = last_id self.relisting = self.first_id is not None or self.last_id is not None self.github_session = GitHubSession( credentials=self.credentials, user_agent=USER_AGENT ) def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]: return asdict(state) def get_pages(self) -> Iterator[List[Dict[str, Any]]]: current_id = 0 if self.first_id is not None: current_id = self.first_id elif self.state is not None: current_id = self.state.last_seen_id current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}" while self.last_id is None or current_id < self.last_id: logger.debug("Getting page %s", current_url) try: response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up break # We've successfully retrieved a (non-ratelimited) `response`. We # still need to check it for validity. if response.status_code != 200: logger.warning( "Got unexpected status_code %s: %s", response.status_code, response.content, ) break yield response.json() if "next" not in response.links: # No `next` link, we've reached the end of the world logger.debug( "No next link found in the response headers, all caught up" ) break # GitHub strongly advises to use the next link directly. We still # parse it to get the id of the last repository we've reached so # far. next_url = response.links["next"]["url"] parsed_url = urlparse(next_url) if not parsed_url.query: logger.warning("Failed to parse url %s", next_url) break parsed_query = parse_qs(parsed_url.query) current_id = int(parsed_query["since"][0]) current_url = next_url def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of GitHub repositories into a list of ListedOrigins. This records the html_url, as well as the pushed_at value if it exists. """ assert self.lister_obj.id is not None seen_in_page: Set[str] = set() for repo in page: if not repo: # null repositories in listings happen sometimes... continue if repo["html_url"] in seen_in_page: continue seen_in_page.add(repo["html_url"]) pushed_at_str = repo.get("pushed_at") pushed_at: Optional[datetime.datetime] = None if pushed_at_str: pushed_at = iso8601.parse_date(pushed_at_str) yield ListedOrigin( lister_id=self.lister_obj.id, url=repo["html_url"], visit_type="git", last_update=pushed_at, ) def commit_page(self, page: List[Dict[str, Any]]): """Update the currently stored state using the latest listed page""" if self.relisting: # Don't update internal state when relisting return if not page: # Sometimes, when you reach the end of the world, GitHub returns an empty # page of repositories return last_id = page[-1]["id"] if last_id > self.state.last_seen_id: self.state.last_seen_id = last_id def finalize(self): if self.relisting: return # Pull fresh lister state from the scheduler backend scheduler_state = self.get_state_from_scheduler() # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. 
if self.state.last_seen_id > scheduler_state.last_seen_id: self.updated = True diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py index 2c874ae..88c5bf4 100644 --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -1,418 +1,245 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import logging -import time -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List import pytest import requests_mock +from swh.core.github.pytest_plugin import github_response_callback from swh.lister.github.lister import GitHubLister from swh.lister.pattern import CredentialsType, ListerStats from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import Lister NUM_PAGES = 10 ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES -def github_repo(i: int) -> Dict[str, Union[int, str]]: - """Basic repository information returned by the GitHub API""" - - repo: Dict[str, Union[int, str]] = { - "id": i, - "html_url": f"https://github.com/origin/{i}", - } - - # Set the pushed_at date on one of the origins - if i == 4321: - repo["pushed_at"] = "2018-11-08T13:16:24Z" - - return repo - - -def github_response_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, -) -> List[Dict[str, Union[str, int]]]: - """Return minimal GitHub API responses for the common case where the loader - hasn't been rate-limited""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - - # Check request parameters: per_page == 1000, since = last_repo_id - assert "per_page" in request.qs - assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)] - assert "since" in request.qs - - since = int(request.qs["since"][0]) - - next_page = since + GitHubLister.PAGE_SIZE - if next_page < ORIGIN_COUNT: - # the first id for the next page is within our origin count; add a Link - # header to the response - next_url = ( - GitHubLister.API_URL - + f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}" - ) - context.headers["Link"] = f"<{next_url}>; rel=next" - - return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)] - - @pytest.fixture() def requests_mocker() -> Iterator[requests_mock.Mocker]: with requests_mock.Mocker() as mock: mock.get(GitHubLister.API_URL, json=github_response_callback) yield mock def get_lister_data(swh_scheduler: SchedulerInterface) -> Lister: """Retrieve the data for the GitHub Lister""" return swh_scheduler.get_or_create_lister(name="github", instance_name="github") def set_lister_state(swh_scheduler: SchedulerInterface, state: Dict[str, Any]) -> None: """Set the state of the lister in database""" lister = swh_scheduler.get_or_create_lister(name="github", instance_name="github") lister.current_state = state swh_scheduler.update_lister(lister) def check_origin_4321(swh_scheduler: SchedulerInterface, lister: Lister) -> None: """Check that origin 4321 exists and has the proper last_update timestamp""" origin_4321_req = swh_scheduler.get_listed_origins( url="https://github.com/origin/4321" ) assert 
len(origin_4321_req.results) == 1 origin_4321 = origin_4321_req.results[0] assert origin_4321.lister_id == lister.id assert origin_4321.visit_type == "git" assert origin_4321.last_update == datetime.datetime( 2018, 11, 8, 13, 16, 24, tzinfo=datetime.timezone.utc ) def check_origin_5555(swh_scheduler: SchedulerInterface, lister: Lister) -> None: """Check that origin 5555 exists and has no last_update timestamp""" origin_5555_req = swh_scheduler.get_listed_origins( url="https://github.com/origin/5555" ) assert len(origin_5555_req.results) == 1 origin_5555 = origin_5555_req.results[0] assert origin_5555.lister_id == lister.id assert origin_5555.visit_type == "git" assert origin_5555.last_update is None def test_from_empty_state( swh_scheduler, caplog, requests_mocker: requests_mock.Mocker ) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Run the lister in incremental mode lister = GitHubLister(scheduler=swh_scheduler) res = lister.run() assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) listed_origins = swh_scheduler.get_listed_origins(limit=ORIGIN_COUNT + 1) assert len(listed_origins.results) == ORIGIN_COUNT assert listed_origins.next_page_token is None lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT} check_origin_4321(swh_scheduler, lister_data) check_origin_5555(swh_scheduler, lister_data) def test_incremental(swh_scheduler, caplog, requests_mocker) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Number of origins to skip skip_origins = 2000 expected_origins = ORIGIN_COUNT - skip_origins # Bump the last_seen_id in the scheduler backend set_lister_state(swh_scheduler, {"last_seen_id": skip_origins}) # Run the lister in incremental mode lister = GitHubLister(scheduler=swh_scheduler) res = lister.run() # add 1 page to the number of full_pages if partial_page_len is not 0 full_pages, partial_page_len = divmod(expected_origins, GitHubLister.PAGE_SIZE) expected_pages = full_pages + bool(partial_page_len) assert res == ListerStats(pages=expected_pages, origins=expected_origins) listed_origins = swh_scheduler.get_listed_origins(limit=expected_origins + 1) assert len(listed_origins.results) == expected_origins assert listed_origins.next_page_token is None lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT} check_origin_4321(swh_scheduler, lister_data) check_origin_5555(swh_scheduler, lister_data) def test_relister(swh_scheduler, caplog, requests_mocker) -> None: caplog.set_level(logging.DEBUG, "swh.lister.github.lister") # Only set this state as a canary: in the currently tested mode, the lister # should not be touching it. set_lister_state(swh_scheduler, {"last_seen_id": 123}) # Use "relisting" mode to list origins between id 10 and 1011 lister = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011) res = lister.run() # Make sure we got two full pages of results assert res == ListerStats(pages=2, origins=2000) # Check that the relisting mode hasn't touched the stored state. 
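The canary state above pins down the docstring's relisting contract: a bounded run covers ids in ]first_id, last_id] and never writes back to the scheduler state. A usage sketch under the same assumptions as these tests; `swh_scheduler` stands for an already-configured scheduler backend, so this is not self-contained:

```python
from swh.lister.github.lister import GitHubLister

# Relist origins with 10 < id <= 1011; the lister may overrun last_id to
# finish its current page, and leaves last_seen_id in the backend untouched.
lister = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011)  # noqa: F821
stats = lister.run()
```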
lister_data = get_lister_data(swh_scheduler) assert lister_data.current_state == {"last_seen_id": 123} -def github_ratelimit_callback( - request: requests_mock.request._RequestObjectProxy, - context: requests_mock.response._Context, - ratelimit_reset: Optional[int], -) -> Dict[str, str]: - """Return a rate-limited GitHub API response.""" - # Check request headers - assert request.headers["Accept"] == "application/vnd.github.v3+json" - assert "Software Heritage Lister" in request.headers["User-Agent"] - if "Authorization" in request.headers: - context.status_code = 429 - else: - context.status_code = 403 - - if ratelimit_reset is not None: - context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset) - - return { - "message": "API rate limit exceeded for .", - "documentation_url": "https://developer.github.com/v3/#rate-limiting", - } - - -@pytest.fixture() -def num_before_ratelimit() -> int: - """Number of successful requests before the ratelimit hits""" - return 0 - - -@pytest.fixture() -def num_ratelimit() -> Optional[int]: - """Number of rate-limited requests; None means infinity""" - return None - - -@pytest.fixture() -def ratelimit_reset() -> Optional[int]: - """Value of the X-Ratelimit-Reset header on ratelimited responses""" - return None - - -@pytest.fixture() -def requests_ratelimited( - num_before_ratelimit: int, - num_ratelimit: Optional[int], - ratelimit_reset: Optional[int], -) -> Iterator[requests_mock.Mocker]: - """Mock requests to the GitHub API, returning a rate-limiting status code - after `num_before_ratelimit` requests. - - GitHub does inconsistent rate-limiting: - - Anonymous requests return a 403 status code - - Authenticated requests return a 429 status code, with an - X-Ratelimit-Reset header. - - This fixture takes multiple arguments (which can be overridden with a - :func:`pytest.mark.parametrize` parameter): - - num_before_ratelimit: the global number of requests until the - ratelimit triggers - - num_ratelimit: the number of requests that return a - rate-limited response. - - ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the - request is authenticated. - - The default values set in the previous fixtures make all requests return a rate - limit response. 
- """ - current_request = 0 - - def response_callback(request, context): - nonlocal current_request - current_request += 1 - if num_before_ratelimit < current_request and ( - num_ratelimit is None - or current_request < num_before_ratelimit + num_ratelimit + 1 - ): - return github_ratelimit_callback(request, context, ratelimit_reset) - else: - return github_response_callback(request, context) - - with requests_mock.Mocker() as mock: - mock.get(GitHubLister.API_URL, json=response_callback) - yield mock - - def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None: - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler) assert lister.github_session.anonymous assert "using anonymous mode" in caplog.records[-1].message caplog.clear() res = lister.run() assert res == ListerStats(pages=0, origins=0) last_log = caplog.records[-1] assert last_log.levelname == "WARNING" assert "No X-Ratelimit-Reset value found in responses" in last_log.message -@pytest.fixture -def github_credentials() -> List[Dict[str, str]]: - """Return a static list of GitHub credentials""" - return sorted( - [{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)] - + [ - {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"} - for i in range(3) - ], - key=lambda c: c["username"], - ) - - -@pytest.fixture -def all_tokens(github_credentials) -> List[str]: - """Return the list of tokens matching the static credential""" - - return [t.get("token", t.get("password")) for t in github_credentials] - - @pytest.fixture def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType: """Return the credentials formatted for use by the lister""" return {"github": {"github": github_credentials}} def test_authenticated_credentials( swh_scheduler, caplog, github_credentials, lister_credentials, all_tokens ): """Test credentials management when the lister is authenticated""" caplog.set_level(logging.DEBUG, "swh.lister.github.lister") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) assert lister.github_session.token_index == 0 assert sorted(lister.credentials, key=lambda t: t["username"]) == github_credentials assert lister.github_session.session.headers["Authorization"] in [ "token %s" % t for t in all_tokens ] -def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None): - """Record calls to time.sleep in the sleep_calls list""" - if duration < 0: - raise ValueError("Can't sleep for a negative amount of time!") - if sleep_calls is not None: - sleep_calls.append(duration) - - -def fake_time_time(): - """Return 0 when running time.time()""" - return 0 - - -@pytest.fixture -def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]: - """Monkeypatch `time.time` and `time.sleep`. 
Returns a list cumulating the arguments - passed to time.sleep().""" - sleeps: List[float] = [] - monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps)) - monkeypatch.setattr(time, "time", fake_time_time) - yield sleeps - - @pytest.mark.parametrize( "num_ratelimit", [1] ) # return a single rate-limit response, then continue def test_ratelimit_once_recovery( swh_scheduler, caplog, requests_ratelimited, num_ratelimit, monkeypatch_sleep_calls, lister_credentials, ): """Check that the lister recovers from hitting the rate-limit once""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) res = lister.run() # check that we used all the pages assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) token_users = [] for record in caplog.records: if "Using authentication token" in record.message: token_users.append(record.args[0]) # check that we used one more token than we saw rate limited requests assert len(token_users) == 1 + num_ratelimit # check that we slept for one second between our token uses assert monkeypatch_sleep_calls == [1] @pytest.mark.parametrize( # Do 5 successful requests, return 6 ratelimits (to exhaust the credentials) with a # set value for X-Ratelimit-Reset, then resume listing successfully. "num_before_ratelimit, num_ratelimit, ratelimit_reset", [(5, 6, 123456)], ) def test_ratelimit_reset_sleep( swh_scheduler, caplog, requests_ratelimited, monkeypatch_sleep_calls, num_before_ratelimit, ratelimit_reset, github_credentials, lister_credentials, ): """Check that the lister properly handles rate-limiting when providing it with authentication tokens""" - caplog.set_level(logging.DEBUG, "swh.lister.github.utils") + caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials) res = lister.run() assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT) # We sleep 1 second every time we change credentials, then we sleep until # ratelimit_reset + 1 expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1] assert monkeypatch_sleep_calls == expected_sleep_calls found_exhaustion_message = False for record in caplog.records: if record.levelname == "INFO": if "Rate limits exhausted for all tokens" in record.message: found_exhaustion_message = True break assert found_exhaustion_message diff --git a/swh/lister/github/utils.py b/swh/lister/github/utils.py index 269d432..df6088e 100644 --- a/swh/lister/github/utils.py +++ b/swh/lister/github/utils.py @@ -1,170 +1,6 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging -import random -import time -from typing import Dict, List, Optional - -import requests -from tenacity import ( - retry, - retry_any, - retry_if_exception_type, - retry_if_result, - wait_exponential, -) - -logger = logging.getLogger(__name__) - - -class RateLimited(Exception): - def __init__(self, response): - self.reset_time: Optional[int] - - # Figure out how long we need to sleep because of that rate limit - ratelimit_reset = response.headers.get("X-Ratelimit-Reset") - retry_after = response.headers.get("Retry-After") - if ratelimit_reset is not None: - self.reset_time = int(ratelimit_reset) - 
elif retry_after is not None: - self.reset_time = int(time.time()) + int(retry_after) + 1 - else: - logger.warning( - "Received a rate-limit-like status code %s, but no rate-limit " - "headers set. Response content: %s", - response.status_code, - response.content, - ) - self.reset_time = None - self.response = response - - -class MissingRateLimitReset(Exception): - pass - - -class GitHubSession: - """Manages a :class:`requests.Session` with (optionally) multiple credentials, - and cycles through them when reaching rate-limits.""" - - def __init__( - self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None - ) -> None: - """Initialize a requests session with the proper headers for requests to - GitHub.""" - self.credentials = credentials - if self.credentials: - random.shuffle(self.credentials) - - self.session = requests.Session() - - self.session.headers.update( - {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent} - ) - - self.anonymous = not self.credentials - - if self.anonymous: - logger.warning("No tokens set in configuration, using anonymous mode") - - self.token_index = -1 - self.current_user: Optional[str] = None - - if not self.anonymous: - # Initialize the first token value in the session headers - self.set_next_session_token() - - def set_next_session_token(self) -> None: - """Update the current authentication token with the next one in line.""" - - assert self.credentials - - self.token_index = (self.token_index + 1) % len(self.credentials) - - auth = self.credentials[self.token_index] - - self.current_user = auth["username"] - logger.debug("Using authentication token for user %s", self.current_user) - - if "password" in auth: - token = auth["password"] - else: - token = auth["token"] - - self.session.headers.update({"Authorization": f"token {token}"}) - - @retry( - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_any( - # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g. - # when running the lister on a connection with high latency - retry_if_exception_type(requests.exceptions.ChunkedEncodingError), - # 502 status codes happen for a Server Error, sometimes - retry_if_result(lambda r: r.status_code == 502), - ), - ) - def _request(self, url: str) -> requests.Response: - response = self.session.get(url) - - if ( - # GitHub returns inconsistent status codes between unauthenticated - # rate limit and authenticated rate limits. Handle both. - response.status_code == 429 - or (self.anonymous and response.status_code == 403) - ): - raise RateLimited(response) - - return response - - def request(self, url) -> requests.Response: - """Repeatedly requests the given URL, cycling through credentials and sleeping - if necessary; until either a successful response or :exc:`MissingRateLimitReset` - """ - # The following for/else loop handles rate limiting; if successful, - # it provides the rest of the function with a `response` object. - # - # If all tokens are rate-limited, we sleep until the reset time, - # then `continue` into another iteration of the outer while loop, - # attempting to get data from the same URL again. 
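The comments above belong to the removed `RateLimited` bookkeeping, which now lives in `swh.core.github.utils`. Its wake-up time derivation reads well in isolation: prefer the absolute `X-Ratelimit-Reset` timestamp, fall back to the relative `Retry-After`. A standalone sketch of that rule (`reset_time_from_headers` is an illustrative name, not the library API):

```python
import time
from typing import Mapping, Optional


def reset_time_from_headers(headers: Mapping[str, str]) -> Optional[int]:
    """Unix timestamp at which a rate-limited token becomes usable again."""
    ratelimit_reset = headers.get("X-Ratelimit-Reset")
    retry_after = headers.get("Retry-After")
    if ratelimit_reset is not None:
        return int(ratelimit_reset)
    if retry_after is not None:
        # Retry-After is relative; add a one-second safety margin.
        return int(time.time()) + int(retry_after) + 1
    # No usable header: the caller eventually raises MissingRateLimitReset.
    return None
```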
- - while True: - max_attempts = len(self.credentials) if self.credentials else 1 - reset_times: Dict[int, int] = {} # token index -> time - for attempt in range(max_attempts): - try: - return self._request(url) - except RateLimited as e: - reset_info = "(unknown reset)" - if e.reset_time is not None: - reset_times[self.token_index] = e.reset_time - reset_info = "(resetting in %ss)" % (e.reset_time - time.time()) - - if not self.anonymous: - logger.info( - "Rate limit exhausted for current user %s %s", - self.current_user, - reset_info, - ) - # Use next token in line - self.set_next_session_token() - # Wait one second to avoid triggering GitHub's abuse rate limits - time.sleep(1) - - # All tokens have been rate-limited. What do we do? - - if not reset_times: - logger.warning( - "No X-Ratelimit-Reset value found in responses for any token; " - "Giving up." - ) - raise MissingRateLimitReset() - - sleep_time = max(reset_times.values()) - time.time() + 1 - logger.info( - "Rate limits exhausted for all tokens. Sleeping for %f seconds.", - sleep_time, - ) - time.sleep(sleep_time) +from swh.core.github.utils import * # noqa diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 4ccc532..2dc6cc5 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,390 +1,425 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log import xmltodict +from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] +SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") + @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. 
Defaults to url's network location if unset. incremental: bool, defaults to True. Defines whether incremental listing is activated. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) self.jar_origins: Dict[str, ListedOrigin] = {} + self.github_session = GitHubSession( + credentials=self.credentials, user_agent=USER_AGENT + ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) if response.status_code != 200: logger.error("Index %s not found, stopping", self.INDEX_URL) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P<doc>\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|" + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" )
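The named groups in the regexes above are dictated by how they are consumed later (`m_doc.group("doc")`, `m_val.groups()`, `m_time.group("mtime")`). A self-contained check of the two main line formats against the example lines quoted in the comments:

```python
import re

re_doc = re.compile(r"^doc (?P<doc>\d+)$")
re_val = re.compile(
    r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
    r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
)

assert re_doc.match("doc 13").group("doc") == "13"
m = re_val.match("    value al.aldi|sprova4j|0.1.0|sources|jar")
assert m is not None
assert m.groups() == ("al.aldi", "sprova4j", "0.1.0", "sources", "jar")
```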
# Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. logger.info("Fetching poms...") for pom in out_pom: try: response = self.page_request(pom, {}) project = xmltodict.parse(response.content) project_d = project.get("project", {}) scm_d = project_d.get("scm") if scm_d is not None: connection = scm_d.get("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], "url": connection, } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom, ) except xmltodict.expat.ExpatError as error: logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: + """Retrieve scm origin out of the page information. Only called when the type + of the page is scm. + + Try to detect an scm/vcs repository. Note that the official format is in the form: + scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put + the repo url (without the "scm:type"), so we have to check against the content + to extract the type and url properly. + + Raises + AssertionError when the type of the page is not 'scm' + + Returns + ListedOrigin with proper canonical scm url (for github) if any is found, + None otherwise. + + """ + + assert page["type"] == "scm" + visit_type: Optional[str] = None + url: Optional[str] = None + m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"]) + if m_scm is None: + return None + + scm_type = m_scm.group("type") + if scm_type and scm_type in SUPPORTED_SCM_TYPES: + url = m_scm.group("url") + visit_type = scm_type + elif page["url"].endswith(".git"): + # Strip the "scm:" prefix; str.lstrip("scm:") would strip a character + # set, not the literal prefix. + url = page["url"][len("scm:") :] + visit_type = "git" + else: + return None + + if url and visit_type == "git": + # Non-github urls will be returned as is, github ones will be canonical ones + url = self.github_session.get_canonical_url(url) + + if not url: + return None + + assert visit_type is not None + assert self.lister_obj.id is not None + return ListedOrigin( + lister_id=self.lister_obj.id, + url=url, + visit_type=visit_type, + ) + def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: + """Convert a page of Maven repositories into a list of ListedOrigins.""" - assert self.lister_obj.id is not None - scm_types_ok = ("git", "svn", "hg", "cvs", "bzr") if page["type"] == "scm": - # If origin is a scm url: detect scm type and yield.
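The new `get_scm` above splits the `scm:<type>:<url>` connection string before canonicalizing GitHub urls through the API. A reduced, self-contained sketch of the split step (`split_scm` is an illustrative name; the canonicalization needs a live `GitHubSession` and is left out):

```python
import re
from typing import Optional, Tuple

SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")


def split_scm(connection: str) -> Optional[Tuple[str, str]]:
    """Mirror the first half of get_scm: return (visit_type, url) or None."""
    m = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", connection)
    if m is None:
        return None
    if m.group("type") in SUPPORTED_SCM_TYPES:
        return m.group("type"), m.group("url")
    if connection.endswith(".git"):
        return "git", connection[len("scm:") :]
    return None


assert split_scm("scm:git:git://github.com/openengsb/openengsb-framework.git") == (
    "git",
    "git://github.com/openengsb/openengsb-framework.git",
)
```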
- # Note that the official format is: - # scm:git:git://github.com/openengsb/openengsb-framework.git - # but many, many projects directly put the repo url, so we have to - # detect the content to match it properly. - m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) - if m_scm is not None: - scm_type = m_scm.group("type") - if scm_type in scm_types_ok: - scm_url = m_scm.group("url") - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=scm_url, - visit_type=scm_type, - ) - yield origin - else: - if page["url"].endswith(".git"): - origin = ListedOrigin( - lister_id=self.lister_obj.id, - url=page["url"], - visit_type="git", - ) - yield origin + listed_origin = self.get_scm(page) + if listed_origin: + yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet + assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 331461e..b2a88f9 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,334 +1,353 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" -LIST_GIT = ( - "git://github.com/aldialimucaj/sprova4j.git", - "https://github.com/aldialimucaj/sprova4j.git", -) -LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",) +USER_REPO0 = "aldialimucaj/sprova4j" +GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}" +GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}" +LIST_GIT = (GIT_REPO_URL0_HTTPS,) + +USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java" +GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}" +GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git" +GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" +LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,) LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": MVN_URL, }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", "base_url": MVN_URL, }, ) @pytest.fixture def maven_index_full(datadir) -> bytes: return Path(datadir, "http_indexes", "export_full.fld").read_bytes() @pytest.fixture def maven_index_incr_first(datadir) -> bytes: return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes() @pytest.fixture def maven_pom_1(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() @pytest.fixture def maven_index_null_mtime(datadir) -> bytes: return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes() @pytest.fixture def maven_pom_1_malformed(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes() @pytest.fixture def maven_pom_2(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() @pytest.fixture def maven_pom_3(datadir) -> bytes: return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() +@pytest.fixture +def requests_mock(requests_mock): + """If github api calls for the configured scm 
repository, returns its canonical url.""" + for url_api, url_html in [ + (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), + (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), + ]: + requests_mock.get( + url_api, + json={"html_url": url_html}, + ) + yield requests_mock + + @pytest.fixture(autouse=True) def network_requests_mock( requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): requests_mock.get(INDEX_URL, content=maven_index_full) requests_mock.get(URL_POM_1, content=maven_pom_1) requests_mock.get(URL_POM_2, content=maven_pom_2) requests_mock.get(URL_POM_3, content=maven_pom_3) def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 3 git origins + 1 maven origin with 2 releases (one per jar) - assert len(origin_urls) == 4 + assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_full_listing_malformed( swh_scheduler, requests_mock, maven_pom_1_malformed, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. requests_mock.get(URL_POM_1, content=maven_pom_1_malformed) # Then run the lister. stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 - assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC) + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. 
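The overridden `requests_mock` fixture above is what lets `GitHubSession.get_canonical_url` resolve each listed git url to the repository's `html_url`. A self-contained sketch of the request/response shape being stubbed, reusing the test's repository constants:

```python
import requests
import requests_mock

with requests_mock.Mocker() as m:
    m.get(
        "https://api.github.com/repos/aldialimucaj/sprova4j",
        json={"html_url": "https://github.com/aldialimucaj/sprova4j"},
    )
    resp = requests.get("https://api.github.com/repos/aldialimucaj/sprova4j")
    # The canonical origin url the lister ends up recording:
    assert resp.json()["html_url"] == "https://github.com/aldialimucaj/sprova4j"
```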
assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert len(origin_urls) == 2 - assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC) + assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. assert lister.incremental assert lister.updated assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( swh_scheduler, requests_mock, http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. requests_mock.get(URL_POM_1, status_code=http_code) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() # If the maven_index_full step succeeded but not the get_pom step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 3 + assert len(scheduler_origins) == 2 def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime): requests_mock.get(INDEX_URL, content=maven_index_null_mtime) # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. 
assert stats.pages == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].last_update is None def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1): """should continue listing when failing to decode pom file.""" # Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32")) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() # If the maven_index_full step succeeded but not the pom parsing step, # then we get only one maven-jar origin and one git origin. scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == 3 + assert len(scheduler_origins) == 2
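The origin counts asserted throughout these tests hinge on `get_origins_from_page` grouping every version of a `groupId`/`artifactId` under a single origin URL and accumulating its source artifacts. A reduced sketch of that grouping rule; `record` is an illustrative stand-in for the bookkeeping done inside the lister:

```python
from typing import Any, Dict, List

jar_origins: Dict[str, List[Dict[str, Any]]] = {}


def record(origin_url: str, artifact: Dict[str, Any]) -> None:
    """One origin per groupId/artifactId; artifacts accumulate per version."""
    artifacts = jar_origins.setdefault(origin_url, [])
    if artifact not in artifacts:
        artifacts.append(artifact)


record("https://repo1.maven.org/maven2/al/aldi/sprova4j", {"version": "0.1.0"})
record("https://repo1.maven.org/maven2/al/aldi/sprova4j", {"version": "0.1.1"})
assert len(jar_origins) == 1  # a single maven origin...
assert len(jar_origins["https://repo1.maven.org/maven2/al/aldi/sprova4j"]) == 2  # ...with two artifacts
```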