lister.py

# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import logging
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter

from .models import CGitModel
from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase

logger = logging.getLogger(__name__)


class CGitLister(ListerBase):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    For each git repository found, a query is made at the URL found in
    this index to gather the published "Clone" URLs to be used as origin
    URL for that git repo.

    If several "Clone" URLs are provided, prefer the http/https one, if
    any, otherwise fall back to the first one.

    A loader task is created for each git repository:

        Task:
            Type: load-git
            Policy: recurring
            Args:
                <git_clonable_url>

    Example:

        Type: load-git
        Policy: recurring
        Args:
            'https://git.savannah.gnu.org/git/elisp-es.git'

    """
    MODEL = CGitModel
    DEFAULT_URL = 'https://git.savannah.gnu.org/cgit/'
    LISTER_NAME = 'cgit'
    url_prefix_present = True

    def __init__(self, url=None, instance=None, override_config=None):
        """Lister class for CGit repositories.

        Args:
            url (str): main URL of the CGit instance, i.e. url of the index
                of published git repositories on this instance.
            instance (str): Name of cgit instance. Defaults to url's hostname
                if unset.

        """
        super().__init__(override_config=override_config)

        if url is None:
            url = self.config.get('url', self.DEFAULT_URL)
        self.url = url

        if not instance:
            instance = urlparse(url).hostname
        self.instance = instance

        self.session = Session()
        self.session.mount(self.url, HTTPAdapter(max_retries=3))
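
    # For example (illustrative only), instantiating with just a url:
    #
    #   CGitLister(url='https://git.savannah.gnu.org/cgit/')
    #
    # yields a lister whose `instance` defaults to 'git.savannah.gnu.org',
    # i.e. the url's hostname, as computed above.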

    def run(self):
        """List all repositories on the CGit instance, inject them into the
        lister database, and schedule loading tasks for them, 10 repositories
        at a time.
        """
        total = 0
        for repos in grouper(self.get_repos(), 10):
            models = list(filter(None, (self.build_model(repo)
                                        for repo in repos)))
            injected_repos = self.inject_repo_data_into_db(models)
            self.schedule_missing_tasks(models, injected_repos)
            self.db_session.commit()
            total += len(injected_repos)

        logger.debug('Scheduled %s tasks for %s', total, self.url)

    def get_repos(self):
        """Generate git 'project' URLs found on the current CGit server

        """
        next_page = self.url
        while next_page:
            bs_idx = self.get_and_parse(next_page)
            for tr in bs_idx.find(
                    'div', {"class": "content"}).find_all(
                        "tr", {"class": ""}):
                yield urljoin(self.url, tr.find('a')['href'])

            try:
                pager = bs_idx.find('ul', {'class': 'pager'})
                current_page = pager.find('a', {'class': 'current'})
                if current_page:
                    next_page = current_page.parent.next_sibling.a['href']
                    next_page = urljoin(self.url, next_page)
                else:
                    # pager present but no page marked 'current': stop rather
                    # than re-fetching the same page forever
                    next_page = None
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None

    def build_model(self, repo_url):
        """Given the URL of a git repo project page on a CGit server,
        return the repo description (dict) suitable for insertion in the db.
        """
        bs = self.get_and_parse(repo_url)
        urls = [x['href'] for x in bs.find_all('a', {'rel': 'vcs-git'})]

        if not urls:
            return

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ('http', 'https'):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return {'uid': repo_url,
                'name': bs.find('a', title=re.compile('.+'))['title'],
                'origin_type': 'git',
                'instance': self.instance,
                'origin_url': origin_url,
                }

    def get_and_parse(self, url):
        "Get the given url and parse the retrieved HTML using BeautifulSoup"
        return BeautifulSoup(self.session.get(url).text,
                             features='html.parser')
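

# Manual-run sketch, for illustration only (not part of the lister proper):
# it assumes a working swh-lister setup, i.e. a valid configuration file and
# an initialized lister database, which the scheduler normally provides when
# driving this class in production.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    CGitLister(url='https://git.savannah.gnu.org/cgit/').run()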
