diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -94,6 +94,9 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, flavours: Dict[str, Any] = { "official": { "archs": ["x86_64"], @@ -118,6 +121,9 @@ credentials=credentials, url=flavours["official"]["base_info_url"], instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.flavours = flavours diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -47,12 +47,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def download_packages_index(self) -> List[Dict[str, Any]]: diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -53,12 +53,18 @@ page_size: int = 1000, incremental: bool = True, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -30,12 +30,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -50,6 +50,9 @@ instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, base_git_url: Optional[str] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): """Lister class for CGit repositories. @@ -67,6 +70,9 @@ url=url, instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/html"}) diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py --- a/swh/lister/conda/lister.py +++ b/swh/lister/conda/lister.py @@ -41,12 +41,18 @@ url: str = BASE_REPO_URL, channel: str = "", archs: List = [], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=url, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.channel: str = channel self.archs: List[str] = archs diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -81,12 +81,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -32,9 +32,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials + scheduler, + url=CRAN_MIRROR, + instance="cran", + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_pages(self) -> Iterator[PageType]: diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -66,12 +66,18 @@ self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.BASE_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.index_metadata: Dict[str, str] = {} diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -77,12 +77,18 @@ suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=mirror_url, instance=distribution, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # to ensure urljoin will produce valid Sources URL diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py --- a/swh/lister/fedora/lister.py +++ b/swh/lister/fedora/lister.py @@ -6,7 +6,7 @@ from dataclasses import dataclass, field from datetime import datetime, timezone import logging -from typing import Any, Dict, Iterator, List, Set, Type +from typing import Any, Dict, Iterator, List, Optional, Set, Type from urllib.error import HTTPError from urllib.parse import urljoin @@ -91,12 +91,18 @@ instance: str = "fedora", url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", releases: List[Release] = [34, 35, 36], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials={}, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.releases = releases diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -70,6 +70,9 @@ self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, first_id: Optional[int] = None, last_id: Optional[int] = None, ): @@ -79,6 +82,9 @@ url=self.API_URL, instance="github", with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.first_id = first_id diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -103,6 +103,9 @@ name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = False, ignored_project_prefixes: Optional[List[str]] = None, ): @@ -113,6 +116,9 @@ url=url.rstrip("/"), instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.last_page: Optional[str] = None diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -31,12 +31,18 @@ self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GNU_FTP_URL, instance="GNU", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # no side-effect calls in constructor, if extra state is needed, as preconized # by the pattern docstring, this must happen in the get_pages method. diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -75,12 +75,18 @@ api_token: Optional[str] = None, page_size: int = 50, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.query_params = { diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -47,12 +47,18 @@ scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GOLANG_MODULES_INDEX_URL, instance=self.LISTER_NAME, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py --- a/swh/lister/hackage/lister.py +++ b/swh/lister/hackage/lister.py @@ -44,6 +44,9 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, url: Optional[str] = None, ): super().__init__( @@ -51,6 +54,9 @@ credentials=credentials, instance=self.INSTANCE, url=url if url else self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Ensure to set this with same value as the http api search endpoint use # (50 as of august 2022) diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -66,12 +66,18 @@ scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://launchpad.net/", instance="launchpad", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.date_last_modified: Dict[str, Optional[datetime]] = { diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -61,6 +61,9 @@ index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = True, ): """Lister class for Maven repositories. @@ -88,6 +91,9 @@ url=url, instance=instance, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -320,6 +320,9 @@ origin_upstream: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, # canonicalize urls, can be turned off during docker runs canonicalize: bool = True, extensions_to_ignore: List[str] = [], @@ -331,6 +334,9 @@ instance=instance, credentials=credentials, with_github_session=canonicalize, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -53,6 +53,9 @@ page_size: int = 1000, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, @@ -61,6 +64,9 @@ if incremental else self.API_FULL_LISTING_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.page_size = page_size diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -44,12 +44,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_INDEX_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.listing_date: Optional[datetime] = None diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py --- a/swh/lister/opam/lister.py +++ b/swh/lister/opam/lister.py @@ -45,6 +45,9 @@ url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, opam_root: str = "/tmp/opam/", ): super().__init__( @@ -52,6 +55,9 @@ credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.env = os.environ.copy() # Opam root folder is initialized in the :meth:`get_pages` method as no diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -53,6 +53,9 @@ self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, @@ -60,6 +63,9 @@ instance="packagist", credentials=credentials, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -40,9 +40,18 @@ instance: Optional[str] = None, api_token: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials + scheduler=scheduler, + url=urljoin(url, self.API_REPOSITORY_PATH), + instance=instance, + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -36,12 +36,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -43,12 +43,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Store the datetime the lister runs for incremental purpose self.listing_date = datetime.now() diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -70,12 +70,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.PACKAGE_LIST_URL, instance=self.INSTANCE, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # used as termination condition and if useful, becomes the new state when the diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -63,12 +63,18 @@ self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_latest_dump_file(self) -> str: diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -113,12 +113,18 @@ scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Will hold the currently saved "last modified" dates to compare against our diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -45,12 +45,18 @@ url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"})