swh/lister/cgit/lister.py
New version:

# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter

from .models import CGitModel
from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase


class CGitLister(ListerBase):
    MODEL = CGitModel
nahimilega: You could add a docstring to the class. Something like this:
https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/packagist/lister.py$0-15
IIRC, we decided to make a docstring for the lister class which shows its output. I forgot to create a task regarding this (my bad).

ardumont: Either the class or the init, whichever is more suited for such documentation.
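For reference, the suggested class docstring could look something like this (a sketch only; the dict fields mirror what build_model() below returns, and the sample values are illustrative):

    """Lister for CGit instances.

    For each git project advertised by a CGit server, this lister yields
    a dict such as:

        {'uid': 'https://git.example.org/cgit/project.git/',
         'name': 'project.git',
         'origin_type': 'git',
         'instance': 'git.example.org',
         'origin_url': 'https://git.example.org/git/project.git'}

    """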
    DEFAULT_URL = 'http://git.savannah.gnu.org/cgit/'
    LISTER_NAME = 'cgit'
    url_prefix_present = True

    def __init__(self, url=None, instance=None, override_config=None):
        """Inits the lister with the given CGit instance url.

        Args:
            url (str): URL of the CGit instance.
            instance (str): Name of the cgit instance.

        """
        super().__init__(override_config=override_config)
        if url is None:
            url = self.config.get('url', self.DEFAULT_URL)
        self.url = url

        if not instance:
            instance = urlparse(url).hostname
        self.instance = instance
        self.session = Session()
        self.session.mount(self.url, HTTPAdapter(max_retries=3))
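When no instance name is given, it is derived from the hostname of the url; for the default Savannah URL:

>>> from urllib.parse import urlparse
>>> urlparse('http://git.savannah.gnu.org/cgit/').hostname
'git.savannah.gnu.org'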
    def run(self):
        for repos in grouper(self.get_repos(), 100):
            models = list(filter(None, (self.build_model(repo)
                                        for repo in repos)))
            injected_repos = self.inject_repo_data_into_db(models)
            self.schedule_missing_tasks(models, injected_repos)
            self.db_session.commit()
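run() processes and commits repositories in batches of 100 rather than one db transaction per repo. A minimal stand-in for swh.core.utils.grouper, to show the batching behaviour assumed here (not the actual implementation):

from itertools import islice

def batches(iterable, n):
    # yield successive lists of at most n items from any iterable,
    # including one-shot generators such as get_repos()
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch

for batch in batches(range(250), 100):
    print(len(batch))  # 100, 100, 50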
    def get_repos(self):
        """Generate git 'project' URLs found on the current CGit server
        """
        next_page = self.url
        while next_page:
            idx = BeautifulSoup(self.session.get(next_page).text,
                                features='html.parser')

            for tr in idx.find(
                    'div', {"class": "content"}).find_all(
                        "tr", {"class": ""}):
                yield urljoin(self.url, tr.find('a')['href'])

            try:
                pager = idx.find('ul', {'class': 'pager'})
                current_page = pager.find('a', {'class': 'current'})
                if current_page:
                    next_page = current_page.parent.next_sibling.a['href']
                    next_page = urljoin(self.url, next_page)
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None
ardumont (inline, in get_repos): to be used

douardda (author): seen that, and also 'gather published "Clone" URLs' (without 'the')
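The pagination handling in get_repos() relies on cgit's pager markup. A self-contained illustration with hypothetical markup (real cgit pages may differ in detail; note the li elements must be adjacent for next_sibling to be the next li tag):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = ('<ul class="pager">'
        '<li><a class="current" href="?ofs=0">[1]</a></li>'
        '<li><a href="?ofs=50">[2]</a></li>'
        '</ul>')
idx = BeautifulSoup(html, features='html.parser')
pager = idx.find('ul', {'class': 'pager'})
current_page = pager.find('a', {'class': 'current'})
# the parent li's next sibling holds the link to the next page
next_page = current_page.parent.next_sibling.a['href']
print(urljoin('https://git.example.org/cgit/', next_page))
# -> https://git.example.org/cgit/?ofs=50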
    def build_model(self, repo_url):
        """Given the URL of a git repo project page on a CGit server,
        return the repo description (dict) suitable for insertion in the db.
        """
        bs = BeautifulSoup(self.session.get(repo_url).text,
                           features='html.parser')
        urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]
        if not urls:
            return

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ('http', 'https'):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return {'uid': repo_url,
                'name': bs.find('a', title=re.compile('.+'))['title'],
                'origin_type': 'git',
                'instance': self.instance,
                'origin_url': origin_url,
                }
nahimilega (on the BeautifulSoup call in build_model): This line is the same as line 57. Maybe we could make a function for this.
""" | return {'uid': repo_url, | ||||
return { | 'name': bs.find('a', title=re.compile('.+'))['title'], | ||||
'uid': self.PAGE + repo['name'], | |||||
'name': repo['name'], | |||||
'full_name': repo['name'], | |||||
'html_url': repo['origin_url'], | |||||
'origin_url': repo['origin_url'], | |||||
'origin_type': 'git', | 'origin_type': 'git', | ||||
'time_updated': repo['time'], | |||||
'instance': self.instance, | 'instance': self.instance, | ||||
'origin_url': origin_url, | |||||
ardumontUnsubmitted Not Done Inline ActionsI guess the html_url, full_name, etc... and other unpopulated field dbs are defaulting to null values (so it does not break ;). ardumont: I guess the html_url, full_name, etc... and other unpopulated field dbs are defaulting to null… | |||||
douarddaAuthorUnsubmitted Done Inline ActionsThat's the idea yes. douardda: That's the idea yes. | |||||
} | } | ||||
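For reference, a minimal way to exercise the new lister by hand (a sketch; it assumes a working swh-lister configuration with database and scheduler access, and the instance URL is made up):

from swh.lister.cgit.lister import CGitLister

# with url=None, the lister falls back to the 'url' config entry,
# then to DEFAULT_URL
lister = CGitLister(url='https://git.example.org/cgit/')
lister.run()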
Old version (removed by this changeset):

# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import logging

from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse

from .models import CGitModel
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.core.lister_transports import ListerOnePageApiTransport


class CGitLister(ListerOnePageApiTransport, SimpleLister):
    MODEL = CGitModel
    LISTER_NAME = 'cgit'
    PAGE = None
    url_prefix_present = True

    def __init__(self, url, instance=None, url_prefix=None,
                 override_config=None):
        """Inits Class with PAGE url and origin url prefix.

        Args:
            url (str): URL of the CGit instance.
            instance (str): Name of cgit instance.
            url_prefix (str): Prefix of the origin_url. Origin links of the
                repos of some special instances do not match the url of the
                repository page; they have origin urls in the format
                <url_prefix>/<repo_name>.

        """
        self.PAGE = url
        if url_prefix is None:
            self.url_prefix = url
            self.url_prefix_present = False
        else:
            self.url_prefix = url_prefix
        if not self.url_prefix.endswith('/'):
            self.url_prefix += '/'

        url = urlparse(self.PAGE)
        self.url_netloc = find_netloc(url)

        if not instance:
            instance = url.hostname
        self.instance = instance

        ListerOnePageApiTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def list_packages(self, response):
        """List the actual cgit instance origins from the response.

        Find repositories metadata by parsing the html page (response's raw
        content). If there are links in the html page, retrieve those
        repositories metadata from those pages as well. Return the
        repositories as a list of dictionaries.

        Args:
            response (Response): http api request response.

        Returns:
            List of repository origin urls (as dict) included in the
            response.

        """
        repos_details = []
        for repo in self.yield_repo_from_responses(response):
            repo_name = repo.a.text
            origin_url = self.find_origin_url(repo, repo_name)
            try:
                time = repo.span['title']
            except Exception:
                time = None
            if origin_url is not None:
                repos_details.append({
                    'name': repo_name,
                    'time': time,
                    'origin_url': origin_url,
                })
        random.shuffle(repos_details)
        return repos_details

    def yield_repo_from_responses(self, response):
        """Yield repositories from all pages of the cgit instance.

        Finds the number of pages present and yields the repositories
        present.

        Args:
            response (Response): server response.

        Yields:
            List of beautifulsoup objects of repository rows.

        """
        html = response.text
        yield from get_repo_list(html)
        pages = self.get_pages(make_soup(html))
        if len(pages) > 1:
            yield from self.get_repos_from_pages(pages[1:])

    def find_origin_url(self, repo, repo_name):
        """Finds the origin url for a repository.

        Args:
            repo (Beautifulsoup): Beautifulsoup object of the repository
                row present in base url.
            repo_name (str): Repository name.

        Returns:
            string: origin url.

        """
        if self.url_prefix_present:
            return self.url_prefix + repo_name
        return self.get_url(repo)

    def get_pages(self, url_soup):
        """Find URLs of all pages.

        Finds URLs of the pages that are present by parsing the HTML of
        the pagination at the end of the page.

        Args:
            url_soup (Beautifulsoup): a beautifulsoup object of the base
                URL.

        Returns:
            list: URLs of pages present for a cgit instance.

        """
        pages = url_soup.find('div', {"class": "content"}).find_all('li')
        if not pages:
            return [self.PAGE]
        return [self.get_url(page) for page in pages]

    def get_repos_from_pages(self, pages):
        """Find repos from all pages.

        Request the available repos from the pages. This yields the
        available repositories found, as beautifulsoup objects.

        Args:
            pages ([str]): list of urls of all pages present for a
                particular cgit instance.

        Yields:
            List of beautifulsoup objects of repository (url) rows
            present in pages (except the first).

        """
        for page in pages:
            response = requests.get(page)
            if not response.ok:
                logging.warning('Failed to retrieve repositories '
                                'from page %s', page)
                continue
            yield from get_repo_list(response.text)

    def get_url(self, repo):
        """Finds the url of a repo page.

        Finds the url of a repo page by parsing the html of the row of
        that repo present in the base url.

        Args:
            repo (Beautifulsoup): a beautifulsoup object of the repository
                row present in base url.

        Returns:
            string: The url of a repo.

        """
        suffix = repo.a['href']
        return self.url_netloc + suffix

    def get_model_from_repo(self, repo):
        """Transform from repository representation to model.

        """
        return {
            'uid': self.PAGE + repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['origin_url'],
            'origin_url': repo['origin_url'],
            'origin_type': 'git',
            'time_updated': repo['time'],
            'instance': self.instance,
        }

    def transport_response_simplified(self, repos_details):
        """Transform response to list for model manipulation.

        """
        return [self.get_model_from_repo(repo) for repo in repos_details]


def find_netloc(url):
    """Finds the network location from the url.

    URLs in the repo rows are relative to the network location part of
    the base URL, so we need to compute it to reconstruct URLs.

    Args:
        url (ParseResult): urlparse result for the base url.

    Returns:
        string: Scheme and network location part of the base URL.

    Example:
        >>> find_netloc(urlparse('https://git.kernel.org/pub/scm/'))
        'https://git.kernel.org'

    """
    return '%s://%s' % (url.scheme, url.netloc)


def get_repo_list(response):
    """Find repositories (as beautifulsoup objects) available within the
    server response.

    Args:
        response (Response): server response.

    Returns:
        List of all repositories as beautifulsoup objects within the
        response.

    """
    repo_soup = make_soup(response)
    return repo_soup \
        .find('div', {"class": "content"}).find_all("tr", {"class": ""})


def make_soup(response):
    """Instantiates a beautifulsoup object from the response object.

    """
    return BeautifulSoup(response, features="html.parser")
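A worked example of the url_prefix mechanism documented in the old __init__ (hypothetical values, in the spirit of the kernel.org example from find_netloc): when the clone URL does not match the repository page URL, the origin url is built as <url_prefix>/<repo_name>:

url_prefix = 'https://git.kernel.org/pub/scm/'
repo_name = 'linux.git'
origin_url = url_prefix + repo_name
# -> 'https://git.kernel.org/pub/scm/linux.git'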