diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -24,6 +24,8 @@
 Package: python3-swh.lister
 Architecture: all
 Depends: python3-swh.scheduler (>= 0.0.14~),
+         python3-subvertpy (>= 0.9.4~),
+         python3-suds,
          ${misc:Depends},
          ${python3:Depends}
 Breaks: python3-swh.lister.github
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@
 requests
 setuptools
 xmltodict
+suds_jurko
+subvertpy >= 0.9.4
diff --git a/swh/lister/fusionforge/__init__.py b/swh/lister/fusionforge/__init__.py
new file mode 100644
diff --git a/swh/lister/fusionforge/lister.py b/swh/lister/fusionforge/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fusionforge/lister.py
@@ -0,0 +1,360 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import html
+import logging
+import re
+import requests
+
+from suds.client import Client as SoapClient
+from suds.plugin import MessagePlugin
+from subvertpy.ra import RemoteAccess
+
+from swh.lister.core.lister_base import SWHListerBase
+from swh.lister.core.lister_transports import SWHListerHttpTransport
+
+from swh.lister.fusionforge.models import FusionForgeModel
+
+
+logger = logging.getLogger(__name__)
+
+# maximum time (in seconds) to wait for a forge to answer a probe request
+REQUEST_TIMEOUT = 30
+
+# extracts the encoding declared in the XML prolog of a SOAP reply
+_XML_ENCODING_RE = re.compile(
+    r'\s*<\?xml[^>]*?\bencoding\s*=\s*["\']([^"\']+)["\']')
+
+
+class SoapReplyEncodingDetector(MessagePlugin):
+    """
+    suds plugin recording the encoding declared by the last SOAP reply.
+
+    The value is stored lowercased in the ``encoding`` attribute and
+    is 'utf-8' when the reply declares none.
+    """
+    encoding = 'utf-8'
+
+    def received(self, context):
+        # the XML prolog sits at the very beginning of the reply, no
+        # need to decode the whole payload to read it
+        prolog = context.reply[:256].decode('ascii', errors='ignore')
+        m = _XML_ENCODING_RE.match(prolog)
+        self.encoding = m.group(1).lower() if m else 'utf-8'
+
+
+def _check_repo_url(repo_url):
+    """
+    Check if a repository https url is valid.
+
+    Args:
+        repo_url (str): url to probe with a GET request
+
+    Returns:
+        The url that answered with a 200 or 403 status code (the http
+        variant when the https request failed), None otherwise.
+    """
+    try:
+        response = requests.get(repo_url, timeout=REQUEST_TIMEOUT)
+    except requests.exceptions.RequestException:
+        # some repos may not be under https, try http then; only the
+        # scheme is rewritten so that a project name containing
+        # 'https' is left untouched
+        if repo_url.startswith('https://'):
+            return _check_repo_url('http://' + repo_url[len('https://'):])
+        return None
+    # 403 means forbidden to browse but not to git clone or svn
+    # checkout, consider the url valid in that case too
+    if response.status_code in (200, 403):
+        return repo_url
+    return None
+
+
+def _try_to_get_origin_url_and_type(ff_baseurl, project):
+    """
+    Try to find git or svn repository url for
+    a project hosted on a FusionForge instance.
+
+    Args:
+        ff_baseurl (str): host name of the forge
+        project (str): unix name of the project
+
+    Returns:
+        A (url, type) tuple where type is 'git' or 'svn', or
+        (None, None) when no candidate url answered.
+    """
+    # standard FusionForge anonymous repository urls, probed in order
+    candidates = (
+        ('svn', 'https://scm.%s/anonscm/svn/%s/' % (ff_baseurl, project)),
+        ('git', 'https://scm.%s/anonscm/git/%s/%s.git' %
+         (ff_baseurl, project, project)),
+        ('git', 'https://%s/anonscm/git/%s/%s.git' %
+         (ff_baseurl, project, project)),
+    )
+    for origin_type, candidate_url in candidates:
+        repo_url = _check_repo_url(candidate_url)
+        if repo_url:
+            return repo_url, origin_type
+
+    # some svn repository may only be referenced by svn:// url type
+    # use subvertpy to check their availability
+    svn_repo_url = 'svn://scm.%s/svn/%s' % (ff_baseurl, project)
+    try:
+        RemoteAccess(svn_repo_url)
+    except Exception:
+        logger.debug('No svn repository found at %s', svn_repo_url)
+    else:
+        return svn_repo_url, 'svn'
+
+    # other possible url schemes for git and svn repos (for instance
+    # those from https://sourcesup.renater.fr)
+    baseurl_parts = ff_baseurl.split('.')
+    if len(baseurl_parts) > 2:
+        domain = '.'.join(baseurl_parts[1:])
+        for scm in ('subversion', 'svn'):
+            repo_url = _check_repo_url(
+                'https://%s.%s/%s/' % (scm, domain, project))
+            if repo_url:
+                return repo_url, 'svn'
+
+        repo_url = _check_repo_url(
+            'https://git.%s/%s.git' % (domain, project))
+        if repo_url:
+            return repo_url, 'git'
+
+    return None, None
+
+
+def _can_load_svn_repo(svn_repo_url):
+    """
+    Check if a svn repository is valid and
+    contains at least one revision.
+
+    Returns:
+        bool: True when the repository could be reached and its
+        latest revision number is greater than 0.
+    """
+    try:
+        return RemoteAccess(svn_repo_url).get_latest_revnum() > 0
+    except Exception:
+        return False
+
+
+class FusionForgeLister(SWHListerHttpTransport, SWHListerBase):
+    """
+    Lister for FusionForge.
+    Each run lists the projects hosted on one forge (eg.
+    sourcesup.renater.fr, adullact.net, gforge.inria.fr) and creates
+    swh loading tasks for their git or svn repositories.
+    To list the projects, the FusionForge SOAP web services are used.
+    """
+    MODEL = FusionForgeModel
+    PATH_TEMPLATE = None
+    # destination path handed over to the svn loader
+    SVN_DESTINATION_PATH = '/tmp'
+
+    @property
+    def ADDITIONAL_CONFIG(self):  # noqa: N802
+        """Base configuration extended with per-forge credentials."""
+        config = super().ADDITIONAL_CONFIG
+        config['fusionforge_credentials'] = ('object', {
+            'gforge.inria.fr': {
+                'user': None,
+                'password': None
+            }
+        })
+        return config
+
+    def __init__(self, lister_name='fusionforge', override_config=None):
+        # the HTTP transport only gets a placeholder base url: requests
+        # are sent through the SOAP client created by run()
+        SWHListerHttpTransport.__init__(self, api_baseurl="bogus")
+        SWHListerBase.__init__(self, lister_name=lister_name,
+                               override_config=override_config)
+        self.soap_client = None
+        self.soap_reply_handler = SoapReplyEncodingDetector()
+        self.session = ''
+        self.fusionforge_baseurl = None
+        self.repos_data = {}
+
+    def transport_quota_check(self, response):
+        """No quota handling is done: always report (False, 0)."""
+        return False, 0
+
+    def transport_request(self, identifier):
+        """
+        Retrieve metadata associated to a FusionForge project
+        using the getGroupsByName SOAP service.
+
+        Returns:
+            The first group returned by the service, or None when
+            the call failed.
+        """
+        try:
+            return self.soap_client.getGroupsByName(
+                self.session, [identifier])[0]
+        except Exception:
+            logger.debug('Unable to get metadata of project %s',
+                         identifier, exc_info=True)
+            return None
+
+    def process_soap_reply_str(self, text):
+        """
+        Properly decode to utf-8 and HTML unescape a string
+        parsed by suds from a SOAP reply.
+
+        Empty values (None included) are returned as is; a string
+        that cannot be re-decoded is only HTML unescaped.
+        """
+        if not text:
+            return text
+        encoding = self.soap_reply_handler.encoding
+        if encoding != 'utf-8':
+            try:
+                text = text.encode(encoding).decode('utf-8')
+            except (LookupError, UnicodeError):
+                # unknown encoding name, or text that was not
+                # mis-decoded in the first place: keep it untouched
+                pass
+        return html.unescape(text)
+
+    def transport_response_simplified(self, response):
+        """
+        Process metadata for a FusionForge project and try to find url
+        for its code repository (only git and svn at the moment).
+
+        Returns:
+            A list holding the model dict used to create a swh
+            loading task, or an empty list when no repository url
+            could be found.
+        """
+        if not response:
+            return []
+
+        project_name = response.unix_group_name
+        repo = self.repos_data.get(project_name)
+        if repo is not None:
+            # the forge exposes the repositoryapi_* service: the
+            # repository urls were put in cache by run()
+            urls = repo.repository_urls
+            origin_url = urls[0] if urls else None
+            origin_type = repo.repository_type
+        else:
+            # otherwise we try to find the repository url
+            origin_url, origin_type = _try_to_get_origin_url_and_type(
+                self.fusionforge_baseurl, project_name)
+
+        # filter out invalid or empty svn repos
+        if origin_url and origin_type == 'svn':
+            if not _can_load_svn_repo(origin_url):
+                origin_url = None
+
+        if not origin_url:
+            logger.info('Unable to find git or svn repository url '
+                        'for project %s from FusionForge %s',
+                        project_name, self.fusionforge_baseurl)
+            return []
+
+        return [{
+            'uid': project_name,
+            'indexable': project_name,
+            'name': project_name,
+            'full_name': self.process_soap_reply_str(response.group_name),
+            'html_url': response.homepage,
+            'origin_url': origin_url,
+            'origin_type': origin_type,
+            'description': self.process_soap_reply_str(
+                response.short_description),
+        }]
+
+    def run(self, forge_baseurl):
+        """
+        Run the lister.
+
+        Args:
+            forge_baseurl (str): host name of the FusionForge
+                instance to list (eg. adullact.net)
+
+        Returns:
+            bool: True when the listing completed, False when the
+            SOAP web services could not be used.
+        """
+        self.fusionforge_baseurl = forge_baseurl
+        self.session = ''
+        self.repos_data = {}
+        forge_wsdl_url = 'https://%s/soap/?wsdl=1' % forge_baseurl
+
+        # initialize SOAP client
+        try:
+            self.soap_reply_handler = SoapReplyEncodingDetector()
+            self.soap_client = SoapClient(
+                forge_wsdl_url, plugins=[self.soap_reply_handler]).service
+        except Exception:
+            logger.error(
+                'Unable to initialize web service client from url %s',
+                forge_wsdl_url)
+            return False
+
+        # case where login is required to use the web services
+        credentials = self.config.get('fusionforge_credentials') or {}
+        forge_credentials = credentials.get(forge_baseurl) or {}
+        user = forge_credentials.get('user')
+        password = forge_credentials.get('password')
+        if user and password:
+            try:
+                self.session = self.soap_client.login(user, password)
+            except Exception:
+                logger.error('Failed to authenticate user %s on %s',
+                             user, forge_baseurl)
+                return False
+
+        projects = set()
+
+        # some forge (for instance adullact.net) have deployed a service
+        # to list code repositories, try to use it
+        try:
+            repos_list = self.soap_client.repositoryapi_repositoryList(
+                self.session)
+            # create a cache holding repositories data for later use
+            # in the listing process
+            for repo in repos_list:
+                project_name = repo.repository_id.split('/')[-1]
+                projects.add(project_name)
+                self.repos_data[project_name] = repo
+        except Exception:
+            logger.debug('No usable repositoryapi service on %s',
+                         forge_baseurl, exc_info=True)
+
+        # get all public project names on the forge
+        try:
+            projects.update(
+                self.soap_client.getPublicProjectNames(self.session))
+        except Exception:
+            logger.error('Unable to list the public projects of %s',
+                         forge_baseurl)
+            return False
+
+        # process the projects and try to find their code repository,
+        # committing and renewing the db session after each of them
+        for project in sorted(projects):
+            self.ingest_data(project)
+            self.db_session.commit()
+            self.db_session = self.mk_session()
+
+        return True
+
+    def task_dict(self, origin_type, origin_url):
+        """
+        Create scheduler task for loading the found
+        repositories during projects listing.
+        """
+        tsk_dict = super().task_dict(origin_type, origin_url)
+        # set one shot task for the moment
+        tsk_dict['policy'] = 'oneshot'
+        # required parameters for the svn loader
+        if origin_type == 'svn':
+            kwargs = tsk_dict['arguments']['kwargs']
+            kwargs['destination_path'] = self.SVN_DESTINATION_PATH
+            kwargs['svn_url'] = origin_url
+        return tsk_dict
diff --git a/swh/lister/fusionforge/models.py b/swh/lister/fusionforge/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fusionforge/models.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from swh.lister.core.models import ModelBase
+
+
+class FusionForgeModel(ModelBase):
+    """a FusionForge repository"""
+    __tablename__ = 'fusionforge_repos'
+
+    # both columns are filled with the unix name of the project
+    # NOTE(review): the name is not qualified by the forge host, so
+    # projects sharing a name on two forges would collide - confirm
+    uid = Column(String, primary_key=True)
+    indexable = Column(String, index=True)
diff --git a/swh/lister/fusionforge/tasks.py b/swh/lister/fusionforge/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fusionforge/tasks.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.core.tasks import ListerTaskBase  # noqa
+
+from .lister import FusionForgeLister
+
+
+class FusionForgeListerTask(ListerTaskBase):
+    """Task listing the projects hosted on a FusionForge instance"""
+    task_queue = 'swh_lister_fusionforge'
+
+    def new_lister(self):
+        return FusionForgeLister()
+
+    def run_task(self, forge_baseurl):
+        """List the forge identified by forge_baseurl"""
+        lister = self.new_lister()
+        return lister.run(forge_baseurl)