Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
| # Copyright (C) 2019 the Software Heritage developers | # Copyright (C) 2019 the Software Heritage developers | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| import random | import re | ||||
| import logging | from urllib.parse import urlparse, urljoin | ||||
| from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
| import requests | from requests import Session | ||||
| from urllib.parse import urlparse | from requests.adapters import HTTPAdapter | ||||
| from .models import CGitModel | from .models import CGitModel | ||||
| from swh.lister.core.simple_lister import SimpleLister | from swh.core.utils import grouper | ||||
| from swh.lister.core.lister_transports import ListerOnePageApiTransport | from swh.lister.core.lister_base import ListerBase | ||||
| class CGitLister(ListerOnePageApiTransport, SimpleLister): | class CGitLister(ListerBase): | ||||
| MODEL = CGitModel | MODEL = CGitModel | ||||
nahimilega: You could add a docstring to the class. Something like this https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/packagist/lister.py$0-15 | |||||
ardumontUnsubmitted Not Done Inline ActionsEither the class or the __init__, whichever is better suited for such documentation. ardumont: Either the class or the __init__, whichever is better suited for such documentation. | |||||
| DEFAULT_URL = 'http://git.savannah.gnu.org/cgit/' | |||||
| LISTER_NAME = 'cgit' | LISTER_NAME = 'cgit' | ||||
| PAGE = None | |||||
| url_prefix_present = True | url_prefix_present = True | ||||
| def __init__(self, url, instance=None, url_prefix=None, | def __init__(self, url=None, instance=None, override_config=None): | ||||
| override_config=None): | |||||
| """Inits Class with PAGE url and origin url prefix. | """Inits Class with PAGE url and origin url prefix. | ||||
| Args: | Args: | ||||
| url (str): URL of the CGit instance. | url (str): URL of the CGit instance. | ||||
| instance (str): Name of cgit instance. | instance (str): Name of cgit instance. | ||||
| url_prefix (str): Prefix of the origin_url. Origin link of the | |||||
| repos of some special instances do not match | |||||
| the url of the repository page, they have origin | |||||
| url in the format <url_prefix>/<repo_name>. | |||||
| """ | """ | ||||
| self.PAGE = url | super().__init__(override_config=override_config) | ||||
| if url_prefix is None: | |||||
| self.url_prefix = url | |||||
| self.url_prefix_present = False | |||||
| else: | |||||
| self.url_prefix = url_prefix | |||||
| if not self.url_prefix.endswith('/'): | if url is None: | ||||
| self.url_prefix += '/' | url = self.config.get('url', self.DEFAULT_URL) | ||||
| url = urlparse(self.PAGE) | self.url = url | ||||
| self.url_netloc = find_netloc(url) | |||||
| if not instance: | if not instance: | ||||
| instance = url.hostname | instance = urlparse(url).hostname | ||||
| self.instance = instance | self.instance = instance | ||||
| self.session = Session() | |||||
| self.session.mount(self.url, HTTPAdapter(max_retries=3)) | |||||
| ListerOnePageApiTransport .__init__(self) | def run(self): | ||||
| SimpleLister.__init__(self, override_config=override_config) | for repos in grouper(self.get_repos(), 100): | ||||
| models = list(filter(None, (self.build_model(repo) | |||||
| def list_packages(self, response): | for repo in repos))) | ||||
| """List the actual cgit instance origins from the response. | injected_repos = self.inject_repo_data_into_db(models) | ||||
| self.schedule_missing_tasks(models, injected_repos) | |||||
| Find repositories metadata by parsing the html page (response's raw | self.db_session.commit() | ||||
| content). If there are links in the html page, retrieve those | |||||
| repositories metadata from those pages as well. Return the | def get_repos(self): | ||||
| repositories as list of dictionaries. | """Generate git 'project' URLs found on the current CGit server | ||||
| Args: | """ | ||||
| response (Response): http api request response. | next_page = self.url | ||||
| while next_page: | |||||
| Returns: | idx = BeautifulSoup(self.session.get(next_page).text, | ||||
| List of repository origin urls (as dict) included in the response. | features='html.parser') | ||||
| for tr in idx.find( | |||||
| """ | 'div', {"class": "content"}).find_all( | ||||
| repos_details = [] | "tr", {"class": ""}): | ||||
| yield urljoin(self.url, tr.find('a')['href']) | |||||
| for repo in self.yield_repo_from_responses(response): | |||||
| repo_name = repo.a.text | |||||
| origin_url = self.find_origin_url(repo, repo_name) | |||||
| try: | try: | ||||
Not Done Inline Actionsto be used ardumont: to be used | |||||
Done Inline Actionsseen that, and also 'gather published "Clone" URLs' (without 'the') douardda: seen that, and also 'gather published "Clone" URLs' (without 'the') | |||||
| time = repo.span['title'] | pager = idx.find('ul', {'class': 'pager'}) | ||||
| except Exception: | current_page = pager.find('a', {'class': 'current'}) | ||||
| time = None | if current_page: | ||||
| next_page = current_page.parent.next_sibling.a['href'] | |||||
| if origin_url is not None: | next_page = urljoin(self.url, next_page) | ||||
| repos_details.append({ | except (AttributeError, KeyError): | ||||
| 'name': repo_name, | # no pager, or no next page | ||||
| 'time': time, | next_page = None | ||||
| 'origin_url': origin_url, | |||||
| }) | def build_model(self, repo_url): | ||||
| """Given the URL of a git repo project page on a CGit server, | |||||
| random.shuffle(repos_details) | return the repo description (dict) suitable for insertion in the db. | ||||
| return repos_details | """ | ||||
| bs = BeautifulSoup(self.session.get(repo_url).text, | |||||
| def yield_repo_from_responses(self, response): | features='html.parser') | ||||
nahimilegaUnsubmitted Not Done Inline ActionsThis line is the same as line 57. Maybe we could make a function for this. nahimilega: This line is the same as line 57. Maybe we could make a function for this. | |||||
| """Yield repositories from all pages of the cgit instance. | urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})] | ||||
| Finds the number of pages present and yields the list of | if not urls: | ||||
| repositories present. | return | ||||
| Args: | # look for the http/https url, if any, and use it as origin_url | ||||
| response (Response): server response. | for url in urls: | ||||
| if urlparse(url).scheme in ('http', 'https'): | |||||
| Yields: | origin_url = url | ||||
| List of beautifulsoup object of repository rows. | break | ||||
| else: | |||||
| """ | # otherwise, choose the first one | ||||
| html = response.text | origin_url = urls[0] | ||||
| yield from get_repo_list(html) | |||||
| pages = self.get_pages(make_soup(html)) | |||||
| if len(pages) > 1: | |||||
| yield from self.get_repos_from_pages(pages[1:]) | |||||
| def find_origin_url(self, repo, repo_name): | |||||
| """Finds the origin url for a repository | |||||
| Args: | |||||
| repo (Beautifulsoup): Beautifulsoup object of the repository | |||||
| row present in base url. | |||||
| repo_name (str): Repository name. | |||||
| Returns: | |||||
| string: origin url. | |||||
| """ | |||||
| if self.url_prefix_present: | |||||
| return self.url_prefix + repo_name | |||||
| return self.get_url(repo) | |||||
| def get_pages(self, url_soup): | |||||
| """Find URL of all pages. | |||||
| Finds URL of pages that are present by parsing over the HTML of | |||||
| pagination present at the end of the page. | |||||
| Args: | |||||
| url_soup (Beautifulsoup): a beautifulsoup object of base URL | |||||
| Returns: | |||||
| list: URL of pages present for a cgit instance | |||||
| """ | |||||
| pages = url_soup.find('div', {"class": "content"}).find_all('li') | |||||
| if not pages: | |||||
| return [self.PAGE] | |||||
| return [self.get_url(page) for page in pages] | |||||
| def get_repos_from_pages(self, pages): | |||||
| """Find repos from all pages. | |||||
| Request the available repos from the pages. This yields | |||||
| the available repositories found as beautiful object representation. | |||||
| Args: | |||||
| pages ([str]): list of urls of all pages present for a | |||||
| particular cgit instance. | |||||
| Yields: | |||||
| List of beautifulsoup object of repository (url) rows | |||||
| present in pages(except first). | |||||
| """ | |||||
| for page in pages: | |||||
| response = requests.get(page) | |||||
| if not response.ok: | |||||
| logging.warning('Failed to retrieve repositories from page %s', | |||||
| page) | |||||
| continue | |||||
| yield from get_repo_list(response.text) | |||||
| def get_url(self, repo): | |||||
| """Finds url of a repo page. | |||||
| Finds the url of a repo page by parsing over the html of the row of | |||||
| that repo present in the base url. | |||||
| Args: | |||||
| repo (Beautifulsoup): a beautifulsoup object of the repository | |||||
| row present in base url. | |||||
| Returns: | |||||
| string: The url of a repo. | |||||
| """ | |||||
| suffix = repo.a['href'] | |||||
| return self.url_netloc + suffix | |||||
| def get_model_from_repo(self, repo): | |||||
| """Transform from repository representation to model. | |||||
| """ | return {'uid': repo_url, | ||||
| return { | 'name': bs.find('a', title=re.compile('.+'))['title'], | ||||
| 'uid': self.PAGE + repo['name'], | |||||
| 'name': repo['name'], | |||||
| 'full_name': repo['name'], | |||||
| 'html_url': repo['origin_url'], | |||||
| 'origin_url': repo['origin_url'], | |||||
| 'origin_type': 'git', | 'origin_type': 'git', | ||||
| 'time_updated': repo['time'], | |||||
| 'instance': self.instance, | 'instance': self.instance, | ||||
| 'origin_url': origin_url, | |||||
ardumontUnsubmitted Not Done Inline ActionsI guess html_url, full_name, etc., and the other unpopulated db fields default to null values (so it does not break ;). ardumont: I guess html_url, full_name, etc., and the other unpopulated db fields default to null… | |||||
douarddaAuthorUnsubmitted Done Inline ActionsThat's the idea yes. douardda: That's the idea yes. | |||||
| } | } | ||||
| def transport_response_simplified(self, repos_details): | |||||
| """Transform response to list for model manipulation. | |||||
| """ | |||||
| return [self.get_model_from_repo(repo) for repo in repos_details] | |||||
| def find_netloc(url): | |||||
| """Finds the network location from then url. | |||||
| URL in the repo are relative to the network location part of base | |||||
| URL, so we need to compute it to reconstruct URLs. | |||||
| Args: | |||||
| url (urllib): urllib object of url. | |||||
| Returns: | |||||
| string: Scheme and Network location part in the base URL. | |||||
| Example: | |||||
| For url = https://git.kernel.org/pub/scm/ | |||||
| >>> find_netloc(url) | |||||
| 'https://git.kernel.org' | |||||
| """ | |||||
| return '%s://%s' % (url.scheme, url.netloc) | |||||
| def get_repo_list(response): | |||||
| """Find repositories (as beautifulsoup object) available within the server | |||||
| response. | |||||
| Args: | |||||
| response (Response): server response | |||||
| Returns: | |||||
| List all repositories as beautifulsoup object within the response. | |||||
| """ | |||||
| repo_soup = make_soup(response) | |||||
| return repo_soup \ | |||||
| .find('div', {"class": "content"}).find_all("tr", {"class": ""}) | |||||
| def make_soup(response): | |||||
| """Instantiates a beautiful soup object from the response object. | |||||
| """ | |||||
| return BeautifulSoup(response, features="html.parser") | |||||
You could add a docstring to the class. Something like this https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/packagist/lister.py$0-15
IIRC, we decided to give each lister class a docstring that shows the lister's output.
I forgot to create a task for this (my bad).