Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/fusionforge/lister.py
- This file was added.
# Copyright (C) 2017 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import html | |||||
import logging | |||||
import re | |||||
import requests | |||||
from suds.client import Client as SoapClient | |||||
from subvertpy.ra import RemoteAccess | |||||
from swh.lister.core.lister_base import SWHListerBase | |||||
from swh.lister.core.lister_transports import SWHListerHttpTransport | |||||
from swh.lister.fusionforge.models import FusionForgeModel | |||||
from suds.plugin import MessagePlugin | |||||
# suds plugin used to detect the encoding declared by a SOAP reply
class SoapReplyEncodingDetector(MessagePlugin):
    """
    suds message plugin recording the encoding declared by a SOAP reply.

    After each reply, the ``encoding`` attribute holds the lower-cased
    value found in the ``encoding="..."?>`` part of the reply's first
    line, or ``'utf-8'`` when no such declaration is found.
    """

    # Class-level default so that reading ``encoding`` before any reply
    # has been received does not raise AttributeError.
    encoding = 'utf-8'

    # Raw string: ``\?`` is not a valid escape in a normal str literal.
    # The capture is bounded by the quotes so that only the declared
    # value is extracted, whatever follows on the same line.
    _ENCODING_RE = re.compile(r'.*?encoding="([^"]+)"\s*\?>')

    def received(self, context):
        """
        Inspect the raw reply held by ``context.reply`` and store
        the encoding it declares.
        """
        # Only the declaration matters here, so bytes outside ASCII can
        # safely be dropped before matching.
        reply_text = context.reply.decode('ascii', errors='ignore')
        found = self._ENCODING_RE.match(reply_text)
        self.encoding = found.group(1).lower() if found else 'utf-8'
def _check_repo_url(repo_url, timeout=30):
    """
    Check if a repository https url is valid.

    A GET request is sent to the url. When the request itself fails and
    the url uses the https scheme, the same url is tried again over http.

    Args:
        repo_url: the url to probe
        timeout: number of seconds to wait for the server before
            considering the request as failed

    Returns:
        The url that answered with a 200 or 403 status code (it may be
        the http variant of ``repo_url``), None otherwise.
    """
    try:
        # a timeout is mandatory here, otherwise an unresponsive
        # server would block the whole listing process
        response = requests.get(repo_url, timeout=timeout)
    except (requests.RequestException, ValueError) as exc:
        logging.debug('Request to %s failed: %s', repo_url, exc)
        # some repos may not be under https, try http then
        # (only the scheme is rewritten, the rest of the url may
        # legitimately contain the 'https' substring)
        if repo_url.startswith('https://'):
            return _check_repo_url('http://' + repo_url[len('https://'):],
                                   timeout=timeout)
        return None
    # if status code is 200 or 403 (forbidden to browse but
    # not to git clone or svn checkout), consider the url valid
    if response.status_code in (200, 403):
        return repo_url
    return None
def _try_to_get_origin_url_and_type(ff_baseurl, project):
    """
    Try to find git or svn repository url for
    a project hosted on a FusionForge instance.

    Candidate urls are probed one after the other and the first
    one that answers is returned.

    Args:
        ff_baseurl: host name of the FusionForge instance
        project: unix name of the project

    Returns:
        A ``(url, type)`` tuple where type is either 'svn' or 'git',
        or ``(None, None)`` when no repository could be found.
    """
    # standard FusionForge anonymous repository urls, in probing order
    standard_candidates = (
        ('https://scm.%s/anonscm/svn/%s/' % (ff_baseurl, project), 'svn'),
        ('https://scm.%s/anonscm/git/%s/%s.git' %
         (ff_baseurl, project, project), 'git'),
        ('https://%s/anonscm/git/%s/%s.git' %
         (ff_baseurl, project, project), 'git'),
    )
    for candidate_url, scm_type in standard_candidates:
        checked_url = _check_repo_url(candidate_url)
        if checked_url:
            return checked_url, scm_type

    # some svn repository may only be referenced by svn:// url type,
    # use subvertpy to check their availability
    svn_repo_url = 'svn://scm.%s/svn/%s' % (ff_baseurl, project)
    try:
        RemoteAccess(svn_repo_url)
    except Exception as exc:
        # best effort probing: any failure only means that this
        # candidate is not usable
        logging.debug('Unable to access %s: %s', svn_repo_url, exc)
    else:
        return svn_repo_url, 'svn'

    # other possible url schemes for git and svn repos (for instance
    # those from https://sourcesup.renater.fr): the first label of
    # the forge host name is replaced by the name of the scm
    baseurl_parts = ff_baseurl.split('.')
    if len(baseurl_parts) > 2:
        parent_domain = '.'.join(baseurl_parts[1:])
        alternative_candidates = (
            ('https://subversion.%s/%s/' % (parent_domain, project), 'svn'),
            ('https://svn.%s/%s/' % (parent_domain, project), 'svn'),
            ('https://git.%s/%s.git' % (parent_domain, project), 'git'),
        )
        for candidate_url, scm_type in alternative_candidates:
            checked_url = _check_repo_url(candidate_url)
            if checked_url:
                return checked_url, scm_type

    return None, None
def _can_load_svn_repo(svn_repo_url):
    """
    Check if a svn repository is valid and
    contains at least one revision.

    Args:
        svn_repo_url: url of the svn repository to check

    Returns:
        True if a connection could be opened and the latest revision
        number is greater than 0, False otherwise (including when the
        check itself failed).
    """
    try:
        conn = RemoteAccess(svn_repo_url)
        return conn.get_latest_revnum() > 0
    except Exception as exc:
        # a repository that cannot be queried is reported as not
        # loadable, but interpreter exit requests are no longer swallowed
        logging.debug('Unable to query svn repository %s: %s',
                      svn_repo_url, exc)
        return False
class FusionForgeLister(SWHListerHttpTransport, SWHListerBase):
    """
    Lister for FusionForge.

    It takes a list of forge urls from the configuration
    to list projects from (eg. sourcesup.renater.fr, adullact.net,
    gforge.inria.fr) and create swh loading tasks.

    To list the projects, the FusionForge SOAP web services are used.
    """
    MODEL = FusionForgeModel
    PATH_TEMPLATE = None

    @property
    def ADDITIONAL_CONFIG(self):  # noqa: N802
        """
        Configuration of the parent classes extended with a
        'fusionforge_credentials' entry mapping a forge host name
        to the user and password to log in with.
        """
        # work on a copy so that the mapping exposed by the parent
        # classes is never modified through this property
        config = dict(super().ADDITIONAL_CONFIG)
        config['fusionforge_credentials'] = ('object', {
            'gforge.inria.fr': {
                'user': None,
                'password': None
            }
        })
        return config

    def __init__(self, lister_name='fusionforge', override_config=None):
        # projects are requested through the SOAP client created in
        # run(), the base url given to the http transport is a placeholder
        SWHListerHttpTransport.__init__(self, api_baseurl="bogus")
        SWHListerBase.__init__(self, lister_name=lister_name,
                               override_config=override_config)
        self.soap_client = None
        self.soap_reply_handler = None
        self.session = ''
        # set here too so that every method can be called before run()
        self.fusionforge_baseurl = None
        self.repos_data = {}

    def transport_quota_check(self, response):
        """
        Always report that no quota was hit and no delay is required.
        """
        return False, 0

    def transport_request(self, identifier):
        """
        Retrieve metadata associated to a FusionForge project
        using the getGroupsByName SOAP service.

        Args:
            identifier: unix name of the project

        Returns:
            The first entry of the service reply, or None
            if the request failed.
        """
        try:
            return self.soap_client.getGroupsByName(
                self.session, [identifier])[0]
        except Exception as exc:
            # a project whose metadata cannot be fetched is skipped
            logging.warning('Unable to get metadata for project %s: %s',
                            identifier, exc)
            return None

    def process_soap_reply_str(self, text):
        """
        Properly decode to utf-8 and HTML unescape a string
        parsed by suds from a SOAP reply.

        Args:
            text: the string to process, None is returned unchanged

        Returns:
            The decoded and unescaped string.
        """
        if text is None:
            return None
        # 'utf-8' is assumed as long as no reply encoding was detected
        encoding = getattr(self.soap_reply_handler, 'encoding', 'utf-8')
        if encoding != 'utf-8':
            try:
                text = text.encode(encoding).decode('utf-8')
            except (UnicodeError, LookupError) as exc:
                # unknown codec or text that is not utf-8 after all:
                # keep the string as parsed instead of failing
                logging.debug('Unable to decode SOAP reply string: %s', exc)
        return html.unescape(text)

    def transport_response_simplified(self, response):
        """
        Process metadata for a FusionForge project and try to find url
        for its code repository (only git and svn at the moment).
        In that latter case, return a model in order to create a swh
        loading task.

        Args:
            response: project metadata as returned by transport_request

        Returns:
            A list holding a single dict describing the project when a
            code repository was found, an empty list otherwise.
        """
        if not response:
            return []

        project_name = response.unix_group_name
        cached_repo = self.repos_data.get(project_name)
        # if the FusionForge SOAP API has the repositoryapi_* service
        # we already have put in cache the code repository urls
        if cached_repo is not None and cached_repo.repository_urls:
            origin_url = cached_repo.repository_urls[0]
            origin_type = cached_repo.repository_type
        # otherwise we try to find the repository url
        else:
            origin_url, origin_type = _try_to_get_origin_url_and_type(
                self.fusionforge_baseurl, project_name)

        # filter out invalid or empty svn repos
        if origin_type == 'svn' and not _can_load_svn_repo(origin_url):
            origin_url = None

        if not origin_url:
            logging.info('Unable to find git or svn repository url '
                         'for project %s from FusionForge %s',
                         project_name, self.fusionforge_baseurl)
            return []

        return [{
            'uid': project_name,
            'indexable': project_name,
            'name': project_name,
            'full_name': self.process_soap_reply_str(response.group_name),
            'html_url': response.homepage,
            'origin_url': origin_url,
            'origin_type': origin_type,
            'description': self.process_soap_reply_str(
                response.short_description),
        }]

    def run(self, forge_baseurl):
        """
        Run the lister.

        Args:
            forge_baseurl: host name of the FusionForge instance to list

        Returns:
            True if the projects were processed, False if the SOAP
            client could not be set up, the login failed or no project
            name could be retrieved.
        """
        self.fusionforge_baseurl = forge_baseurl
        forge_wsdl_url = 'https://%s/soap/?wsdl=1' % self.fusionforge_baseurl

        # do not reuse the state of a previous run on another forge
        self.session = ''
        self.repos_data = {}

        # initialize SOAP client
        try:
            self.soap_reply_handler = SoapReplyEncodingDetector()
            self.soap_client = SoapClient(
                forge_wsdl_url, plugins=[self.soap_reply_handler]).service
        except Exception:
            logging.exception(
                'Unable to initialize web service client from url %s',
                forge_wsdl_url)
            return False

        # case where login is required to use the web services
        credentials = self.config.get('fusionforge_credentials') or {}
        forge_credentials = credentials.get(forge_baseurl) or {}
        user = forge_credentials.get('user')
        password = forge_credentials.get('password')
        if user and password:
            try:
                self.session = self.soap_client.login(user, password)
            except Exception:
                logging.exception('Failed to authenticate user %s on %s',
                                  user, self.fusionforge_baseurl)
                return False

        projects = set()
        # some forge (for instance adullact.net) have deployed a service
        # to list code repositories, try to use it
        try:
            repos_list = self.soap_client.repositoryapi_repositoryList(
                self.session)
            # create a cache holding repositories data for later use
            # in the listing process
            for repo in repos_list:
                project_name = repo.repository_id.split('/')[-1]
                projects.add(project_name)
                self.repos_data[project_name] = repo
        except Exception as exc:
            # this service is optional, go on without it
            logging.debug('No repository list available from %s: %s',
                          self.fusionforge_baseurl, exc)

        # get all public project names on the forge
        try:
            public_projects = self.soap_client.getPublicProjectNames(
                self.session)
        except Exception:
            logging.exception('Unable to get public project names from %s',
                              self.fusionforge_baseurl)
            # nothing to process unless the repository list gave names
            if not projects:
                return False
            public_projects = []
        projects.update(public_projects or [])

        # process the projects and try to find their code repository
        for project in sorted(projects):
            self.ingest_data(project)
            # commit after each project and go on with a fresh session
            self.db_session.commit()
            self.db_session = self.mk_session()

        return True

    def task_dict(self, origin_type, origin_url):
        """
        Create scheduler task for loading the found
        repositories during projects listing.

        Args:
            origin_type: type of the repository ('git' or 'svn')
            origin_url: url of the repository

        Returns:
            The task dict built by the parent class, turned into
            a 'oneshot' task and completed for svn repositories.
        """
        tsk_dict = super().task_dict(origin_type, origin_url)
        # set one shot task for the moment
        tsk_dict['policy'] = 'oneshot'
        # required parameters for the svn loader
        if origin_type == 'svn':
            kwargs = tsk_dict['arguments']['kwargs']
            kwargs['destination_path'] = '/tmp'
            kwargs['svn_url'] = origin_url
        return tsk_dict