diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -10,6 +10,7 @@ from bs4 import BeautifulSoup import requests +from requests.exceptions import HTTPError from swh.lister import USER_AGENT from swh.lister.pattern import StatelessLister @@ -76,6 +77,7 @@ next_page: Optional[str] = self.url while next_page: bs_idx = self._get_and_parse(next_page) + page_results = [] for tr in bs_idx.find("div", {"class": "content"}).find_all( @@ -113,7 +115,7 @@ for repository in repositories: origin_url = self._get_origin_from_repository_url(repository["url"]) - if not origin_url: + if origin_url is None: continue yield ListedOrigin( @@ -125,7 +127,15 @@ def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: """Extract the git url from the repository page""" - bs = self._get_and_parse(repository_url) + try: + bs = self._get_and_parse(repository_url) + except HTTPError as e: + logger.warning( + "Unexpected HTTP status code %s on %s", + e.response.status_code, + e.response.url, + ) + return None # origin urls are listed on the repository page # TODO check if forcing https is better or not ? diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -7,6 +7,7 @@ import pytest +from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.lister import __version__ from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date from swh.lister.pattern import ListerStats @@ -142,3 +143,20 @@ repository = {"url": "url", "last_updated_date": date_str} assert _parse_last_updated_date(repository) == expected_date + + +requests_mock_datadir_missing_url = requests_mock_datadir_factory( + ignore_urls=["https://git.tizen/cgit/adaptation/ap_samsung/audio-hal-e4x12/",] +) + + +def test_lister_cgit_get_origin_from_repo_failing( + requests_mock_datadir_missing_url, swh_scheduler +): + url = "https://git.tizen/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) + + stats = lister_cgit.run() + + expected_nb_origins = 15 + assert stats == ListerStats(pages=3, origins=expected_nb_origins)