Changeset View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
from typing import Iterator, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import requests

from swh.lister import USER_AGENT
from swh.lister.pattern import StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
# Module-level logger, named after this module per the stdlib convention.
logger = logging.getLogger(__name__)

# A "page" for this lister: the list of repository page URLs found on one
# index page of the cgit web UI.
Repositories = List[str]
class CGitLister(StatelessLister[Repositories]):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    For each found git repository, a query is made at the given url found
    in this index to gather published "Clone" URLs to be used as origin
    URL for that git repo.

    If several "Clone" urls are provided, prefer the http/https one, if
    any, otherwise fallback to the first one.
    """

    LISTER_NAME = "cgit"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
    ):
        """Lister class for CGit repositories.

        Args:
            url: main URL of the CGit instance, i.e. url of the index
                of published git repositories on this instance.
            instance: Name of cgit instance. Defaults to url's hostname
                if unset.
        """
        if not instance:
            instance = urlparse(url).hostname
        super().__init__(
            scheduler=scheduler, credentials=None, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/html", "User-Agent": USER_AGENT}
        )

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup

        Raises:
            requests.HTTPError: if the server replies with an error status,
                so a site change or outage fails loudly instead of being
                silently parsed as an empty page.
        """
        response = self.session.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current CGit server

        Yields one list of repository-page URLs per index page, following
        the cgit pager until no next page is found.
        """
        next_page: Optional[str] = self.url
        while next_page:
            bs_idx = self._get_and_parse(next_page)

            page_results = []
            # repository rows are the un-classed <tr> entries of the index table
            for tr in bs_idx.find("div", {"class": "content"}).find_all(
                "tr", {"class": ""}
            ):
                page_results.append(urljoin(self.url, tr.find("a")["href"]))

            yield page_results

            try:
                pager = bs_idx.find("ul", {"class": "pager"})
                current_page = pager.find("a", {"class": "current"})
                if current_page:
                    # the next page link is the one right after the current page
                    next_page = current_page.parent.next_sibling.a["href"]
                    next_page = urljoin(self.url, next_page)
                else:
                    # Bug fix: a pager with no "current" anchor previously left
                    # next_page unchanged, re-fetching the same page forever.
                    next_page = None
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of cgit repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for repository_url in repositories:
            origin_url = self._get_origin_from_repository_url(repository_url)
            if not origin_url:
                # repository page exposed no clone URL; skip it
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                # TODO (T2988): compute last_update from the "Idle" column
                # of the cgit index page.
                last_update=None,
            )

    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page

        Returns:
            The clone URL advertised by the repository page, preferring an
            http/https one, or None if the page advertises no clone URL.
        """
        bs = self._get_and_parse(repository_url)

        # origin urls are listed on the repository page
        # TODO check if forcing https is better or not ?
        # <link rel='vcs-git' href='git://...' title='...'/>
        # <link rel='vcs-git' href='http://...' title='...'/>
        # <link rel='vcs-git' href='https://...' title='...'/>
        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
        if not urls:
            return None

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ("http", "https"):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return origin_url
missing types for url and instance parameters