Changeset View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter

from .models import CGitModel
from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase

Inline comment (nahimilega): You could add a docstring to the class. Something like this: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/packagist/lister.py$0-15. IIRC, we decided to make a docstring for the lister class which shows its output. I forgot to create a task regarding this (my bad).
Inline comment (ardumont, not done): Either the class or the init, whichever is more suited for such documentation.

class CGitLister(ListerBase):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    For each git repository found, a query is made at the given url found
    in this index to gather published "Clone" URLs to be used as origin
    URL for that git repo.

    If several "Clone" urls are provided, prefer the http/https one, if
    any, otherwise fall back to the first one.

    A loader task is created for each git repository:

    Task:
        Type: load-git
        Policy: recurring
        Args:
            <git_clonable_url>

    Example:
        Type: load-git
        Policy: recurring
        Args:
            'https://git.savannah.gnu.org/git/elisp-es.git'
    """
    MODEL = CGitModel
    DEFAULT_URL = 'http://git.savannah.gnu.org/cgit/'
    LISTER_NAME = 'cgit'
    url_prefix_present = True

Inline comment (ardumont, not done): "to be used"
Inline comment (douardda, done): Seen that, and also 'gather published "Clone" URLs' (without 'the').

    def __init__(self, url=None, instance=None, override_config=None):
        """Lister class for CGit repositories.

        Args:
            url (str): main URL of the CGit instance, i.e. url of the index
                of published git repositories on this instance.
            instance (str): Name of cgit instance. Defaults to url's hostname
                if unset.
        """
        super().__init__(override_config=override_config)

        if url is None:
            url = self.config.get('url', self.DEFAULT_URL)
        self.url = url

        if not instance:
            instance = urlparse(url).hostname
        self.instance = instance
        self.session = Session()
        self.session.mount(self.url, HTTPAdapter(max_retries=3))
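    # Note on the mount above: requests applies the adapter with the
    # longest matching URL prefix, so the retries only cover requests to
    # URLs starting with self.url. With an integer max_retries, failed
    # DNS lookups, socket connections and connection timeouts are
    # retried, but not HTTP error responses.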
""" | def run(self): | ||||
suffix = repo.a['href'] | for repos in grouper(self.get_repos(), 100): | ||||
return self.url_netloc + suffix | models = list(filter(None, (self.build_model(repo) | ||||
for repo in repos))) | |||||
injected_repos = self.inject_repo_data_into_db(models) | |||||
self.schedule_missing_tasks(models, injected_repos) | |||||
self.db_session.commit() | |||||
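    # A doctest-style sketch of the chunking behavior run() relies on
    # (an assumption about swh.core.utils.grouper, not its actual
    # implementation): it batches an iterable into successive fixed-size
    # groups, the last one possibly shorter, without padding:
    #
    #   >>> [list(batch) for batch in grouper(range(5), 2)]
    #   [[0, 1], [2, 3], [4]]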
    def get_repos(self):
        """Generate git 'project' URLs found on the current CGit server
        """
        next_page = self.url
        while next_page:
            bs_idx = self.get_and_parse(next_page)
            for tr in bs_idx.find(
                    'div', {"class": "content"}).find_all(
                        "tr", {"class": ""}):
                yield urljoin(self.url, tr.find('a')['href'])

            try:
                pager = bs_idx.find('ul', {'class': 'pager'})
                current_page = pager.find('a', {'class': 'current'})
                if current_page:
                    next_page = urljoin(
                        self.url,
                        current_page.parent.next_sibling.a['href'])
                else:
                    # pager present but no current-page marker: stop here
                    # rather than loop on the same page forever
                    next_page = None
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None
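    # For reference, a hypothetical, simplified cgit index page matching
    # what get_repos() parses (an assumption for illustration, not output
    # captured from a real instance):
    #
    #   <div class="content">
    #     <table>
    #       <tr class=""><td><a href="/cgit/repo.git/">repo</a></td></tr>
    #     </table>
    #     <ul class="pager">
    #       <li><a class="current" href="?ofs=0">[1]</a></li>
    #       <li><a href="?ofs=50">[2]</a></li>
    #     </ul>
    #   </div>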
    def build_model(self, repo_url):
        """Given the URL of a git repo project page on a CGit server,
        return the repo description (dict) suitable for insertion in the db.
        """
        bs = self.get_and_parse(repo_url)
        urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]

        if not urls:
            return

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ('http', 'https'):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return {'uid': repo_url,
                'name': bs.find('a', title=re.compile('.+'))['title'],
                'origin_type': 'git',
                'instance': self.instance,
                'origin_url': origin_url,
                }
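    # The for/else above prefers an http(s) "Clone" URL when one exists.
    # With hypothetical inputs:
    #   ['git://example.org/r.git', 'https://example.org/r.git']
    # the break selects 'https://example.org/r.git', while with
    #   ['git://example.org/r.git', 'ssh://example.org/r.git']
    # the loop completes without breaking, so the else clause picks the
    # first entry, 'git://example.org/r.git'.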
    def get_and_parse(self, url):
        "Get the given url and parse the retrieved HTML using BeautifulSoup"
        return BeautifulSoup(self.session.get(url).text,
                             features='html.parser')

Inline comment (ardumont, not done): I guess the html_url, full_name, etc., and other unpopulated db fields are defaulting to null values (so it does not break ;).
Inline comment (douardda, done): That's the idea, yes.
Inline comment (nahimilega, not done): This line is the same as line 57. Maybe we could make a function for this.
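A minimal usage sketch, assuming a configured lister environment (ListerBase reads storage and scheduler settings from the swh configuration; the URL shown is the class DEFAULT_URL):

    lister = CGitLister(url='http://git.savannah.gnu.org/cgit/')
    # iterate over cgit project page URLs without touching the database
    for repo_page_url in lister.get_repos():
        print(repo_page_url)
    # or run the full listing, which stores the repos and schedules one
    # recurring load-git task per repository
    lister.run()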