Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||
import logging | import logging | ||||
import re | import re | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||
from urllib.parse import urljoin, urlparse | from urllib.parse import urljoin, urlparse | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
import requests | import requests | ||||
from requests.exceptions import HTTPError | |||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||
from swh.lister.pattern import StatelessLister | from swh.lister.pattern import StatelessLister | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | class CGitLister(StatelessLister[Repositories]): | ||||
def get_pages(self) -> Iterator[Repositories]: | def get_pages(self) -> Iterator[Repositories]: | ||||
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
The last_update date is retrieved on the list of repo page to avoid | The last_update date is retrieved on the list of repo page to avoid | ||||
to compute it on the repository details which only give a date per branch | to compute it on the repository details which only give a date per branch | ||||
""" | """ | ||||
next_page: Optional[str] = self.url | next_page: Optional[str] = self.url | ||||
while next_page: | while next_page: | ||||
bs_idx = self._get_and_parse(next_page) | bs_idx = self._get_and_parse(next_page) | ||||
page_results = [] | page_results = [] | ||||
for tr in bs_idx.find("div", {"class": "content"}).find_all( | for tr in bs_idx.find("div", {"class": "content"}).find_all( | ||||
"tr", {"class": ""} | "tr", {"class": ""} | ||||
): | ): | ||||
url = urljoin(self.url, tr.find("a")["href"]) | url = urljoin(self.url, tr.find("a")["href"]) | ||||
span = tr.find("span", {"class": re.compile("age-")}) | span = tr.find("span", {"class": re.compile("age-")}) | ||||
if span: | if span: | ||||
Show All 21 Lines | class CGitLister(StatelessLister[Repositories]): | ||||
def get_origins_from_page(
    self, repositories: Repositories
) -> Iterator[ListedOrigin]:
    """Convert a page of cgit repositories into a list of ListedOrigins.

    Args:
        repositories: one page of repository entries. Each entry is read
            through its ``"url"`` key and is also handed as a whole to
            ``_parse_last_updated_date``.

    Yields:
        One ``ListedOrigin`` with ``visit_type="git"`` for every repository
        whose origin url could be resolved. Repositories without a usable
        origin url are skipped.
    """
    # Every origin is attributed to this lister through ``lister_obj.id``.
    assert self.lister_obj.id is not None

    for repository in repositories:
        origin_url = self._get_origin_from_repository_url(repository["url"])
        # Skip on any falsy value, not only ``None``: an empty string must
        # never be recorded as an origin url either.
        if not origin_url:
            continue

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_url,
            visit_type="git",
            last_update=_parse_last_updated_date(repository),
        )
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | ||||
"""Extract the git url from the repository page""" | """Extract the git url from the repository page""" | ||||
try: | |||||
bs = self._get_and_parse(repository_url) | bs = self._get_and_parse(repository_url) | ||||
except HTTPError as e: | |||||
logger.warning( | |||||
"Unexpected HTTP status code %s on %s", | |||||
e.response.status_code, | |||||
e.response.url, | |||||
) | |||||
return None | |||||
# origin urls are listed on the repository page | # origin urls are listed on the repository page | ||||
# TODO check if forcing https is better or not ? | # TODO check if forcing https is better or not ? | ||||
# <link rel='vcs-git' href='git://...' title='...'/> | # <link rel='vcs-git' href='git://...' title='...'/> | ||||
# <link rel='vcs-git' href='http://...' title='...'/> | # <link rel='vcs-git' href='http://...' title='...'/> | ||||
# <link rel='vcs-git' href='https://...' title='...'/> | # <link rel='vcs-git' href='https://...' title='...'/> | ||||
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] | urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] | ||||
Show All 37 Lines |