diff --git a/PKG-INFO b/PKG-INFO index 8d4b074..d077351 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,275 +1,275 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.7.0 +Version: 0.7.1 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... ``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` ## lister-cgit Once configured, you can execute a cgit lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cgit.tasks import cgit_lister logging.basicConfig(level=logging.DEBUG) # simple cgit instance cgit_lister(url='https://git.kernel.org/') # cgit instance whose listed repositories differ from the base url cgit_lister(url='https://cgit.kde.org/', url_prefix='https://anongit.kde.org/') ``` ## lister-packagist Once configured, you can execute a Packagist lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.packagist.tasks import packagist_lister logging.basicConfig(level=logging.DEBUG) packagist_lister() ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 8d4b074..d077351 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,275 +1,275 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.7.0 +Version: 0.7.1 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... ``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ``` ## lister-cran Once configured, you can execute a CRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` ## lister-cgit Once configured, you can execute a cgit lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cgit.tasks import cgit_lister logging.basicConfig(level=logging.DEBUG) # simple cgit instance cgit_lister(url='https://git.kernel.org/') # cgit instance whose listed repositories differ from the base url cgit_lister(url='https://cgit.kde.org/', url_prefix='https://anongit.kde.org/') ``` ## lister-packagist Once configured, you can execute a Packagist lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.packagist.tasks import packagist_lister logging.basicConfig(level=logging.DEBUG) packagist_lister() ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 646efa5..4f55e36 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,216 +1,216 @@ # Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup import requests from requests.exceptions import HTTPError from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) Repositories = List[Dict[str, Any]] class CGitLister(StatelessLister[Repositories]): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. The lister currently defines 2 listing behaviors: - If the `base_git_url` is provided, the listed origin urls are computed out of the base git url link and the one listed in the main listed page (resulting in less HTTP queries than the 2nd behavior below). This is expected to be the main deployed behavior. - Otherwise (with no `base_git_url`), for each found git repository listed, one extra HTTP query is made at the given url found in the main listing page to gather published "Clone" URLs to be used as origin URL for that git repo. If several "Clone" urls are provided, prefer the http/https one, if any, otherwise fallback to the first one. """ LISTER_NAME = "cgit" def __init__( self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, base_git_url: Optional[str] = None, ): """Lister class for CGit repositories. Args: url: main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. instance: Name of cgit instance. Defaults to url's hostname if unset. base_git_url: Optional base git url which allows the origin url computations. """ if not instance: instance = urlparse(url).hostname assert instance is not None # Make mypy happy super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, ) self.session = requests.Session() self.session.headers.update( {"Accept": "application/html", "User-Agent": USER_AGENT} ) self.base_git_url = base_git_url def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.session.get(url) response.raise_for_status() return BeautifulSoup(response.text, features="html.parser") def get_pages(self) -> Iterator[Repositories]: """Generate git 'project' URLs found on the current CGit server The last_update date is retrieved on the list of repo page to avoid to compute it on the repository details which only give a date per branch """ next_page: Optional[str] = self.url while next_page: bs_idx = self._get_and_parse(next_page) page_results = [] for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): repository_link = tr.find("a")["href"] repo_url = None git_url = None - base_url = urljoin(self.url, repository_link) + base_url = urljoin(self.url, repository_link).strip("/") if self.base_git_url: # mapping provided # computing git url git_url = base_url.replace(self.url, self.base_git_url) else: # we compute the git detailed page url from which we will retrieve # the git url (cf. self.get_origins_from_page) repo_url = base_url span = tr.find("span", {"class": re.compile("age-")}) if span: last_updated_date = span["title"] else: last_updated_date = None page_results.append( { "url": repo_url, "git_url": git_url, "last_updated_date": last_updated_date, } ) yield page_results try: pager = bs_idx.find("ul", {"class": "pager"}) current_page = pager.find("a", {"class": "current"}) if current_page: next_page = current_page.parent.next_sibling.a["href"] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page next_page = None def get_origins_from_page( self, repositories: Repositories ) -> Iterator[ListedOrigin]: """Convert a page of cgit repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None for repo in repositories: origin_url = repo["git_url"] or self._get_origin_from_repository_url( repo["url"] ) if origin_url is None: continue yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="git", last_update=_parse_last_updated_date(repo), ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: """Extract the git url from the repository page""" try: bs = self._get_and_parse(repository_url) except HTTPError as e: logger.warning( "Unexpected HTTP status code %s on %s", e.response.status_code, e.response.url, ) return None # origin urls are listed on the repository page # TODO check if forcing https is better or not ? # # # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: return None # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] return origin_url def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: """Parse the last updated date""" date = repository.get("last_updated_date") if not date: return None parsed_date = None for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"): try: parsed_date = datetime.strptime(date, date_format) # force UTC to avoid naive datetime if not parsed_date.tzinfo: parsed_date = parsed_date.replace(tzinfo=timezone.utc) break except Exception: pass if not parsed_date: logger.warning( "Could not parse %s last_updated date: %s", repository["url"], date, ) return parsed_date diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py index f963a4e..8f45846 100644 --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -1,233 +1,231 @@ # Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from typing import List import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.lister import __version__ from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date from swh.lister.pattern import ListerStats def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler): url = "https://git.savannah.gnu.org/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) repos: List[List[str]] = list(lister_cgit.get_pages()) flattened_repos = sum(repos, []) assert len(flattened_repos) == 977 - assert ( - flattened_repos[0]["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" - ) + assert flattened_repos[0]["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git" # note the url below is NOT a subpath of /cgit/ assert ( - flattened_repos[-1]["url"] == "https://git.savannah.gnu.org/path/to/yetris.git/" + flattened_repos[-1]["url"] == "https://git.savannah.gnu.org/path/to/yetris.git" ) # noqa # note the url below is NOT on the same server - assert flattened_repos[-2]["url"] == "http://example.org/cgit/xstarcastle.git/" + assert flattened_repos[-2]["url"] == "http://example.org/cgit/xstarcastle.git" def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler): url = "https://git.tizen/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) repos: List[List[str]] = list(lister_cgit.get_pages()) flattened_repos = sum(repos, []) # we should have 16 repos (listed on 3 pages) assert len(repos) == 3 assert len(flattened_repos) == 16 def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler): """cgit lister supports pagination""" url = "https://git.tizen/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) stats = lister_cgit.run() expected_nb_origins = 16 assert stats == ListerStats(pages=3, origins=expected_nb_origins) # test page parsing scheduler_origins = swh_scheduler.get_listed_origins( lister_cgit.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins # test listed repositories for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith("https://git.tizen") # test user agent content assert len(requests_mock_datadir.request_history) != 0 for request in requests_mock_datadir.request_history: assert "User-Agent" in request.headers user_agent = request.headers["User-Agent"] assert "Software Heritage Lister" in user_agent assert __version__ in user_agent def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler): """cgit lister returns last updated date""" url = "https://git.tizen/cgit" urls_without_date = [ f"https://git.tizen.org/cgit/{suffix_url}" for suffix_url in ["All-Projects", "All-Users", "Lock-Projects",] ] lister_cgit = CGitLister(swh_scheduler, url=url) stats = lister_cgit.run() expected_nb_origins = 16 assert stats == ListerStats(pages=3, origins=expected_nb_origins) # test page parsing scheduler_origins = swh_scheduler.get_listed_origins( lister_cgit.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins # test listed repositories for listed_origin in scheduler_origins: if listed_origin.url in urls_without_date: assert listed_origin.last_update is None else: assert listed_origin.last_update is not None @pytest.mark.parametrize( "date_str,expected_date", [ ({}, None), ("unexpected date", None), ("2020-0140-10 10:10:10 (GMT)", None), ( "2020-01-10 10:10:10 (GMT)", datetime( year=2020, month=1, day=10, hour=10, minute=10, second=10, tzinfo=timezone.utc, ), ), ( "2019-08-04 05:10:41 +0100", datetime( year=2019, month=8, day=4, hour=5, minute=10, second=41, tzinfo=timezone(timedelta(hours=1)), ), ), ], ) def test_lister_cgit_date_parsing(date_str, expected_date): """test cgit lister date parsing""" repository = {"url": "url", "last_updated_date": date_str} assert _parse_last_updated_date(repository) == expected_date requests_mock_datadir_missing_url = requests_mock_datadir_factory( - ignore_urls=["https://git.tizen/cgit/adaptation/ap_samsung/audio-hal-e4x12/",] + ignore_urls=["https://git.tizen/cgit/adaptation/ap_samsung/audio-hal-e4x12",] ) def test_lister_cgit_get_origin_from_repo_failing( requests_mock_datadir_missing_url, swh_scheduler ): url = "https://git.tizen/cgit/" lister_cgit = CGitLister(swh_scheduler, url=url) stats = lister_cgit.run() expected_nb_origins = 15 assert stats == ListerStats(pages=3, origins=expected_nb_origins) @pytest.mark.parametrize( "credentials, expected_credentials", [ (None, []), ({"key": "value"}, []), ( {"cgit": {"tizen": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], ) def test_lister_cgit_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): url = "https://git.tizen/cgit/" lister = CGitLister( swh_scheduler, url=url, instance="tizen", credentials=credentials ) # Credentials are allowed in constructor assert lister.credentials == expected_credentials def test_lister_cgit_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "url": "https://git.tizen/cgit/", "instance": "tizen", "credentials": {}, } lister = CGitLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None @pytest.mark.parametrize( "url,base_git_url,expected_nb_origins", [ ("https://git.eclipse.org/c", "https://eclipse.org/r", 5), ("https://git.baserock.org/cgit/", "https://git.baserock.org/git/", 3), ("https://jff.email/cgit/", "git://jff.email/opt/git/", 6), ], ) def test_lister_cgit_with_base_git_url( url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler ): """With base git url provided, listed urls should be the computed origin urls """ lister_cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url,) stats = lister_cgit.run() assert stats == ListerStats(pages=1, origins=expected_nb_origins) # test page parsing scheduler_origins = swh_scheduler.get_listed_origins( lister_cgit.lister_obj.id ).results assert len(scheduler_origins) == expected_nb_origins # test listed repositories for listed_origin in scheduler_origins: assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(base_git_url) assert ( listed_origin.url.startswith(url) is False ), f"url should be mapped to {base_git_url}"