Changeset View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019 the Software Heritage developers | # Copyright (C) 2019 the Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import re | import re | ||||
import logging | import logging | ||||
from urllib.parse import urlparse, urljoin | from urllib.parse import urlparse, urljoin | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
from requests import Session | from requests import Session | ||||
[Inline comment] vlorentz: ?
# from requests.structures import CaseInsensitiveDict | |||||
from requests.adapters import HTTPAdapter | from requests.adapters import HTTPAdapter | ||||
from typing import Any, Dict, Generator, Union | |||||
from .models import CGitModel | from .models import CGitModel | ||||
from swh.core.utils import grouper | from swh.core.utils import grouper | ||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||
from swh.lister.core.lister_base import ListerBase | from swh.lister.core.lister_base import ListerBase | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Show All 28 Lines | Example:: | ||||
Args: | Args: | ||||
'https://git.savannah.gnu.org/git/elisp-es.git' | 'https://git.savannah.gnu.org/git/elisp-es.git' | ||||
""" | """ | ||||
MODEL = CGitModel | MODEL = CGitModel | ||||
DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/' | DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/' | ||||
LISTER_NAME = 'cgit' | LISTER_NAME = 'cgit' | ||||
url_prefix_present = True | url_prefix_present = True | ||||
def __init__(self, url=None, instance=None, override_config=None): | def __init__(self, url=None, instance=None, | ||||
[Inline comment — Not Done] ardumont: You need to type all listers...
override_config=None): | |||||
"""Lister class for CGit repositories. | """Lister class for CGit repositories. | ||||
Args: | Args: | ||||
url (str): main URL of the CGit instance, i.e. url of the index | url : main URL of the CGit instance, i.e. url of the index | ||||
of published git repositories on this instance. | of published git repositories on this instance. | ||||
instance (str): Name of cgit instance. Defaults to url's hostname | instance : Name of cgit instance. Defaults to url's hostname | ||||
if unset. | if unset. | ||||
""" | """ | ||||
[Inline comment — Not Done] vlorentz: Why did you remove annotations?
super().__init__(override_config=override_config) | super().__init__(override_config=override_config) | ||||
if url is None: | if url is None: | ||||
url = self.config.get('url', self.DEFAULT_URL) | url = self.config.get('url', self.DEFAULT_URL) | ||||
self.url = url | self.url = url | ||||
if not instance: | if not instance: | ||||
instance = urlparse(url).hostname | instance = urlparse(url).hostname | ||||
self.instance = instance | self.instance = instance | ||||
self.session = Session() | self.session = Session() | ||||
self.session.mount(self.url, HTTPAdapter(max_retries=3)) | self.session.mount(self.url, HTTPAdapter(max_retries=3)) | ||||
self.session.headers = { | self.session.headers = { | ||||
'User-Agent': USER_AGENT, | 'User-Agent': USER_AGENT, | ||||
} | } | ||||
def run(self): | def run(self) -> Dict[str, str]: | ||||
status = 'uneventful' | status = 'uneventful' | ||||
total = 0 | total = 0 | ||||
for repos in grouper(self.get_repos(), 10): | for repos in grouper(self.get_repos(), 10): | ||||
models = list(filter(None, (self.build_model(repo) | models = list(filter(None, (self.build_model(repo) | ||||
for repo in repos))) | for repo in repos))) | ||||
injected_repos = self.inject_repo_data_into_db(models) | injected_repos = self.inject_repo_data_into_db(models) | ||||
self.schedule_missing_tasks(models, injected_repos) | self.schedule_missing_tasks(models, injected_repos) | ||||
self.db_session.commit() | self.db_session.commit() | ||||
total += len(injected_repos) | total += len(injected_repos) | ||||
logger.debug('Scheduled %s tasks for %s', total, self.url) | logger.debug('Scheduled %s tasks for %s', total, self.url) | ||||
status = 'eventful' | status = 'eventful' | ||||
return {'status': status} | return {'status': status} | ||||
def get_repos(self): | def get_repos(self) -> Generator: | ||||
[Inline comment — Not Done] vlorentz: Please specify completely the type of the Generator.
[Inline comment — Not Done] ardumont: You can use `Iterator[str]`.
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
""" | """ | ||||
next_page = self.url | next_page = self.url | ||||
while next_page: | while next_page: | ||||
bs_idx = self.get_and_parse(next_page) | bs_idx = self.get_and_parse(next_page) | ||||
for tr in bs_idx.find( | for tr in bs_idx.find( | ||||
'div', {"class": "content"}).find_all( | 'div', {"class": "content"}).find_all( | ||||
"tr", {"class": ""}): | "tr", {"class": ""}): | ||||
yield urljoin(self.url, tr.find('a')['href']) | yield urljoin(self.url, tr.find('a')['href']) | ||||
try: | try: | ||||
pager = bs_idx.find('ul', {'class': 'pager'}) | pager = bs_idx.find('ul', {'class': 'pager'}) | ||||
current_page = pager.find('a', {'class': 'current'}) | current_page = pager.find('a', {'class': 'current'}) | ||||
if current_page: | if current_page: | ||||
next_page = current_page.parent.next_sibling.a['href'] | next_page = current_page.parent.next_sibling.a['href'] | ||||
next_page = urljoin(self.url, next_page) | next_page = urljoin(self.url, next_page) | ||||
except (AttributeError, KeyError): | except (AttributeError, KeyError): | ||||
# no pager, or no next page | # no pager, or no next page | ||||
next_page = None | next_page = None | ||||
def build_model(self, repo_url): | def build_model(self, repo_url: str) -> Union[None, Dict[str, Any]]: | ||||
[Inline comment — Not Done] vlorentz: `Optional[Dict[str, Any]]`
"""Given the URL of a git repo project page on a CGit server, | """Given the URL of a git repo project page on a CGit server, | ||||
return the repo description (dict) suitable for insertion in the db. | return the repo description (dict) suitable for insertion in the db. | ||||
""" | """ | ||||
bs = self.get_and_parse(repo_url) | bs = self.get_and_parse(repo_url) | ||||
urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})] | urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})] | ||||
if not urls: | if not urls: | ||||
return | return None | ||||
[Inline comment — Not Done] vlorentz: not needed
# look for the http/https url, if any, and use it as origin_url | # look for the http/https url, if any, and use it as origin_url | ||||
for url in urls: | for url in urls: | ||||
if urlparse(url).scheme in ('http', 'https'): | if urlparse(url).scheme in ('http', 'https'): | ||||
origin_url = url | origin_url = url | ||||
break | break | ||||
else: | else: | ||||
# otherwise, choose the first one | # otherwise, choose the first one | ||||
origin_url = urls[0] | origin_url = urls[0] | ||||
return {'uid': repo_url, | return {'uid': repo_url, | ||||
'name': bs.find('a', title=re.compile('.+'))['title'], | 'name': bs.find('a', title=re.compile('.+'))['title'], | ||||
'origin_type': 'git', | 'origin_type': 'git', | ||||
'instance': self.instance, | 'instance': self.instance, | ||||
'origin_url': origin_url, | 'origin_url': origin_url, | ||||
} | } | ||||
def get_and_parse(self, url): | def get_and_parse(self, url: str) -> BeautifulSoup: | ||||
"Get the given url and parse the retrieved HTML using BeautifulSoup" | "Get the given url and parse the retrieved HTML using BeautifulSoup" | ||||
return BeautifulSoup(self.session.get(url).text, | return BeautifulSoup(self.session.get(url).text, | ||||
features='html.parser') | features='html.parser') |
?