diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py
index bbb1f63..f4246a2 100644
--- a/swh/lister/github/lister.py
+++ b/swh/lister/github/lister.py
@@ -1,351 +1,357 @@
 # Copyright (C) 2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from dataclasses import asdict, dataclass
 import datetime
 import logging
 import random
 import time
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set
 from urllib.parse import parse_qs, urlparse
 
 import iso8601
 import requests
 from tenacity import (
     retry,
     retry_any,
     retry_if_exception_type,
     retry_if_result,
     wait_exponential,
 )
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
 
 def init_session(session: Optional[requests.Session] = None) -> requests.Session:
     """Initialize a requests session with the proper headers for requests to
     GitHub."""
     if not session:
         session = requests.Session()
 
     session.headers.update(
         {"Accept": "application/vnd.github.v3+json", "User-Agent": USER_AGENT}
     )
 
     return session
 
 
 class RateLimited(Exception):
     def __init__(self, response):
         self.reset_time: Optional[int]
 
         # Figure out how long we need to sleep because of that rate limit
         ratelimit_reset = response.headers.get("X-Ratelimit-Reset")
         retry_after = response.headers.get("Retry-After")
         if ratelimit_reset is not None:
             self.reset_time = int(ratelimit_reset)
         elif retry_after is not None:
             self.reset_time = int(time.time()) + int(retry_after) + 1
         else:
             logger.warning(
                 "Received a rate-limit-like status code %s, but no rate-limit "
                 "headers set. Response content: %s",
                 response.status_code,
                 response.content,
             )
             self.reset_time = None
         self.response = response
 
 
 @retry(
     wait=wait_exponential(multiplier=1, min=4, max=10),
     retry=retry_any(
         # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
         # when running the lister on a connection with high latency
         retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
         # 502 status codes happen for a Server Error, sometimes
         retry_if_result(lambda r: r.status_code == 502),
     ),
 )
 def github_request(
     url: str, token: Optional[str] = None, session: Optional[requests.Session] = None
 ) -> requests.Response:
     session = init_session(session)
 
     headers = {}
     if token:
         headers["Authorization"] = f"token {token}"
 
     response = session.get(url, headers=headers)
 
     anonymous = token is None and "Authorization" not in session.headers
 
     if (
         # GitHub returns inconsistent status codes between unauthenticated
         # rate limit and authenticated rate limits. Handle both.
         response.status_code == 429
         or (anonymous and response.status_code == 403)
     ):
         raise RateLimited(response)
 
     return response
 
 
 @dataclass
 class GitHubListerState:
     """State of the GitHub lister"""
 
     last_seen_id: int = 0
     """Numeric id of the last repository listed on an incremental pass"""
 
 
 class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
     """List origins from GitHub.
 
     By default, the lister runs in incremental mode: it lists all repositories,
     starting with the `last_seen_id` stored in the scheduler backend.
 
     Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in
     that mode, the lister finds the origins present in the range **excluding**
     `first_id` and **including** `last_id`. In this mode, the lister can overrun the
     `last_id`: it will always record all the origins seen in a given page. As the lister
     is fully idempotent, this is not a practical problem. Once relisting completes, the
     lister state in the scheduler backend is not updated.
 
     When the config contains a set of credentials, we shuffle this list at the beginning
     of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the
     same token over and over again, until its rate limit runs out. Once that happens, we
     switch to the next token over in our shuffled list.
 
     When a request fails with a rate limit exception for all tokens, we pause the
     listing until the largest value for X-Ratelimit-Reset over all tokens.
 
     When the credentials aren't set in the lister config, the lister can run in
     anonymous mode too (e.g. for testing purposes).
 
     .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits
 
 
     Args:
       first_id: the id of the first repo to list
       last_id: stop listing after seeing a repo with an id higher than this value.
 
     """  # noqa: E501
 
     LISTER_NAME = "github"
 
     API_URL = "https://api.github.com/repositories"
     PAGE_SIZE = 1000
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         credentials: CredentialsType = None,
         first_id: Optional[int] = None,
         last_id: Optional[int] = None,
     ):
         super().__init__(
             scheduler=scheduler,
             credentials=credentials,
             url=self.API_URL,
             instance="github",
         )
 
         self.first_id = first_id
         self.last_id = last_id
 
         self.relisting = self.first_id is not None or self.last_id is not None
 
         self.session = init_session()
 
         random.shuffle(self.credentials)
 
         self.anonymous = not self.credentials
 
         if self.anonymous:
             logger.warning("No tokens set in configuration, using anonymous mode")
 
         self.token_index = -1
         self.current_user: Optional[str] = None
 
         if not self.anonymous:
             # Initialize the first token value in the session headers
             self.set_next_session_token()
 
     def set_next_session_token(self) -> None:
         """Update the current authentication token with the next one in line."""
 
         self.token_index = (self.token_index + 1) % len(self.credentials)
 
         auth = self.credentials[self.token_index]
         if "password" in auth:
             token = auth["password"]
         else:
             token = auth["token"]
 
         self.current_user = auth["username"]
         logger.debug("Using authentication token for user %s", self.current_user)
 
         self.session.headers.update({"Authorization": f"token {token}"})
 
     def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
         return GitHubListerState(**d)
 
     def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]:
         return asdict(state)
 
     def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
         current_id = 0
         if self.first_id is not None:
             current_id = self.first_id
         elif self.state is not None:
             current_id = self.state.last_seen_id
 
         current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}"
 
         while self.last_id is None or current_id < self.last_id:
             logger.debug("Getting page %s", current_url)
 
             # The following for/else loop handles rate limiting; if successful,
             # it provides the rest of the function with a `response` object.
             #
             # If all tokens are rate-limited, we sleep until the reset time,
             # then `continue` into another iteration of the outer while loop,
             # attempting to get data from the same URL again.
 
             max_attempts = 1 if self.anonymous else len(self.credentials)
             reset_times: Dict[int, int] = {}  # token index -> time
             for attempt in range(max_attempts):
                 try:
                     response = github_request(current_url, session=self.session)
                     break
                 except RateLimited as e:
                     reset_info = "(unknown reset)"
                     if e.reset_time is not None:
                         reset_times[self.token_index] = e.reset_time
                         reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
 
                     if not self.anonymous:
                         logger.info(
                             "Rate limit exhausted for current user %s %s",
                             self.current_user,
                             reset_info,
                         )
                         # Use next token in line
                         self.set_next_session_token()
                         # Wait one second to avoid triggering GitHub's abuse rate limits
                         time.sleep(1)
             else:
                 # All tokens have been rate-limited. What do we do?
 
                 if not reset_times:
                     logger.warning(
                         "No X-Ratelimit-Reset value found in responses for any token; "
                         "Giving up."
                     )
                     break
 
                 sleep_time = max(reset_times.values()) - time.time() + 1
                 logger.info(
                     "Rate limits exhausted for all tokens. Sleeping for %f seconds.",
                     sleep_time,
                 )
                 time.sleep(sleep_time)
                 # This goes back to the outer page-by-page loop, doing one more
                 # iteration on the same page
                 continue
 
             # We've successfully retrieved a (non-ratelimited) `response`. We
             # still need to check it for validity.
 
             if response.status_code != 200:
                 logger.warning(
                     "Got unexpected status_code %s: %s",
                     response.status_code,
                     response.content,
                 )
                 break
 
             yield response.json()
 
             if "next" not in response.links:
                 # No `next` link, we've reached the end of the world
                 logger.debug(
                     "No next link found in the response headers, all caught up"
                 )
                 break
 
             # GitHub strongly advises to use the next link directly. We still
             # parse it to get the id of the last repository we've reached so
             # far.
             next_url = response.links["next"]["url"]
             parsed_url = urlparse(next_url)
             if not parsed_url.query:
                 logger.warning("Failed to parse url %s", next_url)
                 break
 
             parsed_query = parse_qs(parsed_url.query)
             current_id = int(parsed_query["since"][0])
             current_url = next_url
 
     def get_origins_from_page(
         self, page: List[Dict[str, Any]]
     ) -> Iterator[ListedOrigin]:
         """Convert a page of GitHub repositories into a list of ListedOrigins.
 
         This records the html_url, as well as the pushed_at value if it exists.
         """
         assert self.lister_obj.id is not None
 
+        seen_in_page: Set[str] = set()
+
         for repo in page:
             if not repo:
                 # null repositories in listings happen sometimes...
                 continue
 
+            if repo["html_url"] in seen_in_page:
+                continue
+            seen_in_page.add(repo["html_url"])
+
             pushed_at_str = repo.get("pushed_at")
             pushed_at: Optional[datetime.datetime] = None
             if pushed_at_str:
                 pushed_at = iso8601.parse_date(pushed_at_str)
 
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=repo["html_url"],
                 visit_type="git",
                 last_update=pushed_at,
             )
 
     def commit_page(self, page: List[Dict[str, Any]]):
         """Update the currently stored state using the latest listed page"""
         if self.relisting:
             # Don't update internal state when relisting
             return
 
         if not page:
             # Sometimes, when you reach the end of the world, GitHub returns an empty
             # page of repositories
             return
 
         last_id = page[-1]["id"]
 
         if last_id > self.state.last_seen_id:
             self.state.last_seen_id = last_id
 
     def finalize(self):
         if self.relisting:
             return
 
         # Pull fresh lister state from the scheduler backend
         scheduler_state = self.get_state_from_scheduler()
 
         # Update the lister state in the backend only if the last seen id of
         # the current run is higher than that stored in the database.
         if self.state.last_seen_id > scheduler_state.last_seen_id:
             self.updated = True