swh/lister/cgit/lister.py
New version:

# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter

from .models import CGitModel
from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase


class CGitLister(ListerBase):
    MODEL = CGitModel
nahimilega: You could add a docstring to the class. Something like this:
https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/packagist/lister.py$0-15
IIRC, we decided to make a docstring for the lister class which shows its output. I forgot to create a task regarding this (my bad).

ardumont: Either the class or the init, whichever is more suited for such documentation.
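For reference, the suggested class docstring could look something like this (a sketch only; the dict fields mirror what build_model() below returns, and the sample values are illustrative):

    """Lister for CGit instances.

    For each git project advertised by a CGit server, this lister yields
    a dict such as:

        {'uid': 'https://git.example.org/cgit/project.git/',
         'name': 'project.git',
         'origin_type': 'git',
         'instance': 'git.example.org',
         'origin_url': 'https://git.example.org/git/project.git'}

    """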
    DEFAULT_URL = 'http://git.savannah.gnu.org/cgit/'
    LISTER_NAME = 'cgit'
    url_prefix_present = True

    def __init__(self, url=None, instance=None, override_config=None):
        """Inits the lister with the given CGit instance url.

        Args:
            url (str): URL of the CGit instance.
            instance (str): Name of the cgit instance.

        """
        super().__init__(override_config=override_config)
        if url is None:
            url = self.config.get('url', self.DEFAULT_URL)
        self.url = url

        if not instance:
            instance = urlparse(url).hostname
        self.instance = instance
        self.session = Session()
        self.session.mount(self.url, HTTPAdapter(max_retries=3))
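When no instance name is given, it is derived from the hostname of the url; for the default Savannah URL:

>>> from urllib.parse import urlparse
>>> urlparse('http://git.savannah.gnu.org/cgit/').hostname
'git.savannah.gnu.org'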
    def run(self):
        for repos in grouper(self.get_repos(), 100):
            models = list(filter(None, (self.build_model(repo)
                                        for repo in repos)))
            injected_repos = self.inject_repo_data_into_db(models)
            self.schedule_missing_tasks(models, injected_repos)
            self.db_session.commit()
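run() processes and commits repositories in batches of 100 rather than one db transaction per repo. A minimal stand-in for swh.core.utils.grouper, to show the batching behaviour assumed here (not the actual implementation):

from itertools import islice

def batches(iterable, n):
    # yield successive lists of at most n items from any iterable,
    # including one-shot generators such as get_repos()
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch

for batch in batches(range(250), 100):
    print(len(batch))  # 100, 100, 50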
    def get_repos(self):
        """Generate git 'project' URLs found on the current CGit server
        """
        next_page = self.url
        while next_page:
            idx = BeautifulSoup(self.session.get(next_page).text,
                                features='html.parser')

            for tr in idx.find(
                    'div', {"class": "content"}).find_all(
                        "tr", {"class": ""}):
                yield urljoin(self.url, tr.find('a')['href'])

            try:
                pager = idx.find('ul', {'class': 'pager'})
                current_page = pager.find('a', {'class': 'current'})
                if current_page:
                    next_page = current_page.parent.next_sibling.a['href']
                    next_page = urljoin(self.url, next_page)
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None
ardumont (inline, in get_repos): to be used

douardda (author): seen that, and also 'gather published "Clone" URLs' (without 'the')
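The pagination handling in get_repos() relies on cgit's pager markup. A self-contained illustration with hypothetical markup (real cgit pages may differ in detail; note the li elements must be adjacent for next_sibling to be the next li tag):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = ('<ul class="pager">'
        '<li><a class="current" href="?ofs=0">[1]</a></li>'
        '<li><a href="?ofs=50">[2]</a></li>'
        '</ul>')
idx = BeautifulSoup(html, features='html.parser')
pager = idx.find('ul', {'class': 'pager'})
current_page = pager.find('a', {'class': 'current'})
# the parent li's next sibling holds the link to the next page
next_page = current_page.parent.next_sibling.a['href']
print(urljoin('https://git.example.org/cgit/', next_page))
# -> https://git.example.org/cgit/?ofs=50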
    def build_model(self, repo_url):
        """Given the URL of a git repo project page on a CGit server,
        return the repo description (dict) suitable for insertion in the db.
        """
        bs = BeautifulSoup(self.session.get(repo_url).text,
                           features='html.parser')
        urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]
        if not urls:
            return

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ('http', 'https'):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return {'uid': repo_url,
                'name': bs.find('a', title=re.compile('.+'))['title'],
                'origin_type': 'git',
                'instance': self.instance,
                'origin_url': origin_url,
                }
nahimilega (on the BeautifulSoup call in build_model): This line is the same as line 57. Maybe we could make a function for this.
""" | return {'uid': repo_url, | ||||
return { | 'name': bs.find('a', title=re.compile('.+'))['title'], | ||||
'uid': self.PAGE + repo['name'], | |||||
'name': repo['name'], | |||||
'full_name': repo['name'], | |||||
'html_url': repo['origin_url'], | |||||
'origin_url': repo['origin_url'], | |||||
'origin_type': 'git', | 'origin_type': 'git', | ||||
'time_updated': repo['time'], | |||||
'instance': self.instance, | 'instance': self.instance, | ||||
'origin_url': origin_url, | |||||
ardumontUnsubmitted Not Done Inline ActionsI guess the html_url, full_name, etc... and other unpopulated field dbs are defaulting to null values (so it does not break ;). ardumont: I guess the html_url, full_name, etc... and other unpopulated field dbs are defaulting to null… | |||||
douarddaAuthorUnsubmitted Done Inline ActionsThat's the idea yes. douardda: That's the idea yes. | |||||
} | } | ||||
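For reference, a minimal way to exercise the new lister by hand (a sketch; it assumes a working swh-lister configuration with database and scheduler access, and the instance URL is made up):

from swh.lister.cgit.lister import CGitLister

# with url=None, the lister falls back to the 'url' config entry,
# then to DEFAULT_URL
lister = CGitLister(url='https://git.example.org/cgit/')
lister.run()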
Old version (removed by this changeset):

# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import logging

from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse

from .models import CGitModel
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.core.lister_transports import ListerOnePageApiTransport


class CGitLister(ListerOnePageApiTransport, SimpleLister):
    MODEL = CGitModel
    LISTER_NAME = 'cgit'
    PAGE = None
    url_prefix_present = True

    def __init__(self, url, instance=None, url_prefix=None,
                 override_config=None):
        """Inits Class with PAGE url and origin url prefix.

        Args:
            url (str): URL of the CGit instance.
            instance (str): Name of cgit instance.
            url_prefix (str): Prefix of the origin_url. Origin links of the
                repos of some special instances do not match the url of the
                repository page; they have origin urls in the format
                <url_prefix>/<repo_name>.

        """
        self.PAGE = url
        if url_prefix is None:
            self.url_prefix = url
            self.url_prefix_present = False
        else:
            self.url_prefix = url_prefix
        if not self.url_prefix.endswith('/'):
            self.url_prefix += '/'

        url = urlparse(self.PAGE)
        self.url_netloc = find_netloc(url)

        if not instance:
            instance = url.hostname
        self.instance = instance

        ListerOnePageApiTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def list_packages(self, response):
        """List the actual cgit instance origins from the response.

        Find repositories metadata by parsing the html page (response's raw
        content). If there are links in the html page, retrieve those
        repositories metadata from those pages as well. Return the
        repositories as a list of dictionaries.

        Args:
            response (Response): http api request response.

        Returns:
            List of repository origin urls (as dict) included in the
            response.

        """
        repos_details = []
        for repo in self.yield_repo_from_responses(response):
            repo_name = repo.a.text
            origin_url = self.find_origin_url(repo, repo_name)
            try:
                time = repo.span['title']
            except Exception:
                time = None
            if origin_url is not None:
                repos_details.append({
                    'name': repo_name,
                    'time': time,
                    'origin_url': origin_url,
                })
        random.shuffle(repos_details)
        return repos_details

    def yield_repo_from_responses(self, response):
        """Yield repositories from all pages of the cgit instance.

        Finds the number of pages present and yields the repositories
        present.

        Args:
            response (Response): server response.

        Yields:
            List of beautifulsoup objects of repository rows.

        """
        html = response.text
        yield from get_repo_list(html)
        pages = self.get_pages(make_soup(html))
        if len(pages) > 1:
            yield from self.get_repos_from_pages(pages[1:])

    def find_origin_url(self, repo, repo_name):
        """Finds the origin url for a repository.

        Args:
            repo (Beautifulsoup): Beautifulsoup object of the repository
                row present in base url.
            repo_name (str): Repository name.

        Returns:
            string: origin url.

        """
        if self.url_prefix_present:
            return self.url_prefix + repo_name
        return self.get_url(repo)

    def get_pages(self, url_soup):
        """Find URLs of all pages.

        Finds URLs of the pages that are present by parsing the HTML of
        the pagination at the end of the page.

        Args:
            url_soup (Beautifulsoup): a beautifulsoup object of the base
                URL.

        Returns:
            list: URLs of pages present for a cgit instance.

        """
        pages = url_soup.find('div', {"class": "content"}).find_all('li')
        if not pages:
            return [self.PAGE]
        return [self.get_url(page) for page in pages]

    def get_repos_from_pages(self, pages):
        """Find repos from all pages.

        Request the available repos from the pages. This yields the
        available repositories found, as beautifulsoup objects.

        Args:
            pages ([str]): list of urls of all pages present for a
                particular cgit instance.

        Yields:
            List of beautifulsoup objects of repository (url) rows
            present in pages (except the first).

        """
        for page in pages:
            response = requests.get(page)
            if not response.ok:
                logging.warning('Failed to retrieve repositories '
                                'from page %s', page)
                continue
            yield from get_repo_list(response.text)

    def get_url(self, repo):
        """Finds the url of a repo page.

        Finds the url of a repo page by parsing the html of the row of
        that repo present in the base url.

        Args:
            repo (Beautifulsoup): a beautifulsoup object of the repository
                row present in base url.

        Returns:
            string: The url of a repo.

        """
        suffix = repo.a['href']
        return self.url_netloc + suffix

    def get_model_from_repo(self, repo):
        """Transform from repository representation to model.

        """
        return {
            'uid': self.PAGE + repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['origin_url'],
            'origin_url': repo['origin_url'],
            'origin_type': 'git',
            'time_updated': repo['time'],
            'instance': self.instance,
        }

    def transport_response_simplified(self, repos_details):
        """Transform response to list for model manipulation.

        """
        return [self.get_model_from_repo(repo) for repo in repos_details]


def find_netloc(url):
    """Finds the network location from the url.

    URLs in the repo rows are relative to the network location part of
    the base URL, so we need to compute it to reconstruct URLs.

    Args:
        url (ParseResult): urlparse result for the base url.

    Returns:
        string: Scheme and network location part of the base URL.

    Example:
        >>> find_netloc(urlparse('https://git.kernel.org/pub/scm/'))
        'https://git.kernel.org'

    """
    return '%s://%s' % (url.scheme, url.netloc)


def get_repo_list(response):
    """Find repositories (as beautifulsoup objects) available within the
    server response.

    Args:
        response (Response): server response.

    Returns:
        List of all repositories as beautifulsoup objects within the
        response.

    """
    repo_soup = make_soup(response)
    return repo_soup \
        .find('div', {"class": "content"}).find_all("tr", {"class": ""})


def make_soup(response):
    """Instantiates a beautifulsoup object from the response object.

    """
    return BeautifulSoup(response, features="html.parser")
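A worked example of the url_prefix mechanism documented in the old __init__ (hypothetical values, in the spirit of the kernel.org example from find_netloc): when the clone URL does not match the repository page URL, the origin url is built as <url_prefix>/<repo_name>:

url_prefix = 'https://git.kernel.org/pub/scm/'
repo_name = 'linux.git'
origin_url = url_prefix + repo_name
# -> 'https://git.kernel.org/pub/scm/linux.git'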