Changeset View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter

from .models import CGitModel
from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase

Inline comment (nahimilega): You could add a docstring to the class. Something like this: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/packagist/lister.py$0-15. IIRC, we decided to make a docstring for the lister class which shows its output. I forgot to create a task regarding this (my bad).
Inline comment (ardumont, not done): Either the class or the init, whichever is more suited for such documentation.

class CGitLister(ListerBase):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    For each git repository found, a query is made at the given url found
    in this index to gather published "Clone" URLs to be used as origin
    URL for that git repo.

    If several "Clone" urls are provided, prefer the http/https one, if
    any, otherwise fall back to the first one.

    A loader task is created for each git repository:

    Task:
        Type: load-git
        Policy: recurring
        Args:
            <git_clonable_url>

    Example:
        Type: load-git
        Policy: recurring
        Args:
            'https://git.savannah.gnu.org/git/elisp-es.git'
    """
    MODEL = CGitModel
    DEFAULT_URL = 'http://git.savannah.gnu.org/cgit/'
    LISTER_NAME = 'cgit'
    url_prefix_present = True

Inline comment (ardumont, not done): "to be used"
Inline comment (douardda, done): Seen that, and also 'gather published "Clone" URLs' (without 'the').

    def __init__(self, url=None, instance=None, override_config=None):
        """Lister class for CGit repositories.

        Args:
            url (str): main URL of the CGit instance, i.e. url of the index
                of published git repositories on this instance.
            instance (str): Name of cgit instance. Defaults to url's hostname
                if unset.
        """
        super().__init__(override_config=override_config)

        if url is None:
            url = self.config.get('url', self.DEFAULT_URL)
        self.url = url

        if not instance:
            instance = urlparse(url).hostname
        self.instance = instance
        self.session = Session()
        self.session.mount(self.url, HTTPAdapter(max_retries=3))
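    # Note on the mount above: requests applies the adapter with the
    # longest matching URL prefix, so the retries only cover requests to
    # URLs starting with self.url. With an integer max_retries, failed
    # DNS lookups, socket connections and connection timeouts are
    # retried, but not HTTP error responses.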
""" | def run(self): | ||||
suffix = repo.a['href'] | for repos in grouper(self.get_repos(), 100): | ||||
return self.url_netloc + suffix | models = list(filter(None, (self.build_model(repo) | ||||
for repo in repos))) | |||||
injected_repos = self.inject_repo_data_into_db(models) | |||||
self.schedule_missing_tasks(models, injected_repos) | |||||
self.db_session.commit() | |||||
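    # A doctest-style sketch of the chunking behavior run() relies on
    # (an assumption about swh.core.utils.grouper, not its actual
    # implementation): it batches an iterable into successive fixed-size
    # groups, the last one possibly shorter, without padding:
    #
    #   >>> [list(batch) for batch in grouper(range(5), 2)]
    #   [[0, 1], [2, 3], [4]]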
    def get_repos(self):
        """Generate git 'project' URLs found on the current CGit server
        """
        next_page = self.url
        while next_page:
            bs_idx = self.get_and_parse(next_page)
            for tr in bs_idx.find(
                    'div', {"class": "content"}).find_all(
                        "tr", {"class": ""}):
                yield urljoin(self.url, tr.find('a')['href'])

            try:
                pager = bs_idx.find('ul', {'class': 'pager'})
                current_page = pager.find('a', {'class': 'current'})
                if current_page:
                    next_page = urljoin(
                        self.url,
                        current_page.parent.next_sibling.a['href'])
                else:
                    # pager present but no current-page marker: stop here
                    # rather than loop on the same page forever
                    next_page = None
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None
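    # For reference, a hypothetical, simplified cgit index page matching
    # what get_repos() parses (an assumption for illustration, not output
    # captured from a real instance):
    #
    #   <div class="content">
    #     <table>
    #       <tr class=""><td><a href="/cgit/repo.git/">repo</a></td></tr>
    #     </table>
    #     <ul class="pager">
    #       <li><a class="current" href="?ofs=0">[1]</a></li>
    #       <li><a href="?ofs=50">[2]</a></li>
    #     </ul>
    #   </div>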
    def build_model(self, repo_url):
        """Given the URL of a git repo project page on a CGit server,
        return the repo description (dict) suitable for insertion in the db.
        """
        bs = self.get_and_parse(repo_url)
        urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]

        if not urls:
            return

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ('http', 'https'):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return {'uid': repo_url,
                'name': bs.find('a', title=re.compile('.+'))['title'],
                'origin_type': 'git',
                'instance': self.instance,
                'origin_url': origin_url,
                }
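    # The for/else above prefers an http(s) "Clone" URL when one exists.
    # With hypothetical inputs:
    #   ['git://example.org/r.git', 'https://example.org/r.git']
    # the break selects 'https://example.org/r.git', while with
    #   ['git://example.org/r.git', 'ssh://example.org/r.git']
    # the loop completes without breaking, so the else clause picks the
    # first entry, 'git://example.org/r.git'.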
    def get_and_parse(self, url):
        "Get the given url and parse the retrieved HTML using BeautifulSoup"
        return BeautifulSoup(self.session.get(url).text,
                             features='html.parser')

Inline comment (ardumont, not done): I guess the html_url, full_name, etc., and other unpopulated db fields are defaulting to null values (so it does not break ;).
Inline comment (douardda, done): That's the idea, yes.
Inline comment (nahimilega, not done): This line is the same as line 57. Maybe we could make a function for this.
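A minimal usage sketch, assuming a configured lister environment (ListerBase reads storage and scheduler settings from the swh configuration; the URL shown is the class DEFAULT_URL):

    lister = CGitLister(url='http://git.savannah.gnu.org/cgit/')
    # iterate over cgit project page URLs without touching the database
    for repo_page_url in lister.get_repos():
        print(repo_page_url)
    # or run the full listing, which stores the repos and schedules one
    # recurring load-git task per repository
    lister.run()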