Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from datetime import datetime, timezone | |||||
import logging | import logging | ||||
from typing import Iterator, List, Optional | import re | ||||
from typing import Any, Dict, Iterator, List, Optional | |||||
from urllib.parse import urljoin, urlparse | from urllib.parse import urljoin, urlparse | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
import requests | import requests | ||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||
from swh.lister.pattern import StatelessLister | from swh.lister.pattern import StatelessLister | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Repositories = List[str] | Repositories = List[Dict[str, Any]] | ||||
class CGitLister(StatelessLister[Repositories]): | class CGitLister(StatelessLister[Repositories]): | ||||
anlambert: You can remove this variable as it is no longer used. | |||||
"""Lister class for CGit repositories. | """Lister class for CGit repositories. | ||||
This lister will retrieve the list of published git repositories by | This lister will retrieve the list of published git repositories by | ||||
parsing the HTML page(s) of the index retrieved at `url`. | parsing the HTML page(s) of the index retrieved at `url`. | ||||
For each found git repository, a query is made at the given url found | For each found git repository, a query is made at the given url found | ||||
in this index to gather published "Clone" URLs to be used as origin | in this index to gather published "Clone" URLs to be used as origin | ||||
URL for that git repo. | URL for that git repo. | ||||
If several "Clone" urls are provided, prefer the http/https one, if | If several "Clone" urls are provided, prefer the http/https one, if | ||||
any, otherwise fallback to the first one. | any, otherwise fallback to the first one. | ||||
""" | """ | ||||
LISTER_NAME = "cgit" | LISTER_NAME = "cgit" | ||||
def __init__( | def __init__( | ||||
self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None | self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None | ||||
): | ): | ||||
"""Lister class for CGit repositories. | """Lister class for CGit repositories. | ||||
Args: | Args: | ||||
url (str): main URL of the CGit instance, i.e. url of the index | url (str): main URL of the CGit instance, i.e. url of the index | ||||
of published git repositories on this instance. | of published git repositories on this instance. | ||||
instance (str): Name of cgit instance. Defaults to url's hostname | instance (str): Name of cgit instance. Defaults to url's hostname | ||||
if unset. | if unset. | ||||
""" | """ | ||||
if not instance: | if not instance: | ||||
instance = urlparse(url).hostname | instance = urlparse(url).hostname | ||||
assert instance is not None # Make mypy happy | assert instance is not None # Make mypy happy | ||||
Not Done Inline Actions
You could simplify that function by directly using the datetime module from the Python standard library.

from datetime import datetime, timezone

parsed_date = None
for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"):
    try:
        parsed_date = datetime.strptime(date, date_format)
        # force UTC to avoid naive datetime
        if not parsed_date.tzinfo:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        break
    except Exception:
        pass
return parsed_date

The "%Y-%m-%d %H:%M:%S (%Z)" format will not create a timezone-aware datetime.
anlambert: You could simplify that function by directly using the `datetime` module from Python standard… | |||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, credentials=None, url=url, instance=instance, | scheduler=scheduler, credentials=None, url=url, instance=instance, | ||||
) | ) | ||||
self.session = requests.Session() | self.session = requests.Session() | ||||
self.session.headers.update( | self.session.headers.update( | ||||
{"Accept": "application/html", "User-Agent": USER_AGENT} | {"Accept": "application/html", "User-Agent": USER_AGENT} | ||||
) | ) | ||||
def _get_and_parse(self, url: str) -> BeautifulSoup: | def _get_and_parse(self, url: str) -> BeautifulSoup: | ||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup""" | """Get the given url and parse the retrieved HTML using BeautifulSoup""" | ||||
response = self.session.get(url) | response = self.session.get(url) | ||||
response.raise_for_status() | response.raise_for_status() | ||||
return BeautifulSoup(response.text, features="html.parser") | return BeautifulSoup(response.text, features="html.parser") | ||||
def get_pages(self) -> Iterator[Repositories]: | def get_pages(self) -> Iterator[Repositories]: | ||||
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
The last_update date is retrieved on the list of repo page to avoid | |||||
to compute it on the repository details which only give a date per branch | |||||
""" | """ | ||||
next_page: Optional[str] = self.url | next_page: Optional[str] = self.url | ||||
while next_page: | while next_page: | ||||
bs_idx = self._get_and_parse(next_page) | bs_idx = self._get_and_parse(next_page) | ||||
page_results = [] | page_results = [] | ||||
for tr in bs_idx.find("div", {"class": "content"}).find_all( | for tr in bs_idx.find("div", {"class": "content"}).find_all( | ||||
"tr", {"class": ""} | "tr", {"class": ""} | ||||
): | ): | ||||
page_results.append(urljoin(self.url, tr.find("a")["href"])) | url = urljoin(self.url, tr.find("a")["href"]) | ||||
span = tr.find("span", {"class": re.compile("age-")}) | |||||
if span: | |||||
last_updated_date = span["title"] | |||||
else: | |||||
last_updated_date = None | |||||
page_results.append( | |||||
{"url": url, "last_updated_date": last_updated_date} | |||||
) | |||||
yield page_results | yield page_results | ||||
try: | try: | ||||
pager = bs_idx.find("ul", {"class": "pager"}) | pager = bs_idx.find("ul", {"class": "pager"}) | ||||
current_page = pager.find("a", {"class": "current"}) | current_page = pager.find("a", {"class": "current"}) | ||||
if current_page: | if current_page: | ||||
next_page = current_page.parent.next_sibling.a["href"] | next_page = current_page.parent.next_sibling.a["href"] | ||||
next_page = urljoin(self.url, next_page) | next_page = urljoin(self.url, next_page) | ||||
except (AttributeError, KeyError): | except (AttributeError, KeyError): | ||||
# no pager, or no next page | # no pager, or no next page | ||||
next_page = None | next_page = None | ||||
def get_origins_from_page( | def get_origins_from_page( | ||||
self, repositories: Repositories | self, repositories: Repositories | ||||
) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
"""Convert a page of cgit repositories into a list of ListedOrigins.""" | """Convert a page of cgit repositories into a list of ListedOrigins.""" | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
for repository_url in repositories: | for repository in repositories: | ||||
origin_url = self._get_origin_from_repository_url(repository_url) | origin_url = self._get_origin_from_repository_url(repository["url"]) | ||||
if not origin_url: | if not origin_url: | ||||
continue | continue | ||||
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
url=origin_url, | url=origin_url, | ||||
visit_type="git", | visit_type="git", | ||||
last_update=None, | last_update=_parse_last_updated_date(repository), | ||||
) | ) | ||||
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | ||||
"""Extract the git url from the repository page""" | """Extract the git url from the repository page""" | ||||
bs = self._get_and_parse(repository_url) | bs = self._get_and_parse(repository_url) | ||||
# origin urls are listed on the repository page | # origin urls are listed on the repository page | ||||
# TODO check if forcing https is better or not ? | # TODO check if forcing https is better or not ? | ||||
Show All 9 Lines | def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | ||||
for url in urls: | for url in urls: | ||||
if urlparse(url).scheme in ("http", "https"): | if urlparse(url).scheme in ("http", "https"): | ||||
origin_url = url | origin_url = url | ||||
break | break | ||||
else: | else: | ||||
# otherwise, choose the first one | # otherwise, choose the first one | ||||
origin_url = urls[0] | origin_url = urls[0] | ||||
return origin_url | return origin_url | ||||
def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]:
    """Parse the last updated date attached to a listed repository.

    Args:
        repository: dict holding the repository "url" and, optionally, the raw
            "last_updated_date" string to parse.

    Returns:
        A timezone-aware datetime, or None when no date is available or when
        the date matches none of the supported formats (a warning is logged
        in the latter case).
    """
    date = repository.get("last_updated_date")
    if not date:
        return None

    for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"):
        try:
            parsed_date = datetime.strptime(date, date_format)
        except (TypeError, ValueError):
            # The value does not match this format (or is not a string):
            # stay best-effort and try the next format, without hiding
            # unrelated errors behind a blanket ``except Exception``.
            continue
        # The "(%Z)" format produces a naive datetime; force UTC so that
        # callers never receive a naive value.
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date

    logger.warning(
        "Could not parse %s last_updated date: %s", repository["url"], date,
    )
    return None
You can remove this variable as it is no longer used.