Page MenuHomeSoftware Heritage

D267.id887.diff
No OneTemporary

D267.id887.diff

diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -24,6 +24,8 @@
Package: python3-swh.lister
Architecture: all
Depends: python3-swh.scheduler (>= 0.0.14~),
+ python3-subvertpy (>= 0.9.4~),
+ python3-suds,
${misc:Depends},
${python3:Depends}
Breaks: python3-swh.lister.github
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@
requests
setuptools
xmltodict
+suds_jurko
+subvertpy >= 0.9.4
diff --git a/swh/lister/fusionforge/__init__.py b/swh/lister/fusionforge/__init__.py
new file mode 100644
diff --git a/swh/lister/fusionforge/lister.py b/swh/lister/fusionforge/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fusionforge/lister.py
@@ -0,0 +1,269 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import requests
+
+from suds.client import Client as SoapClient
+from subvertpy.ra import RemoteAccess
+
+from swh.lister.core.lister_base import SWHListerBase
+from swh.lister.core.lister_transports import SWHListerHttpTransport
+
+from swh.lister.fusionforge.models import FusionForgeModel
+
+
+def _check_repo_url(repo_url):
+ """
+ Check if a repository https url is valid.
+ """
+ try:
+ # send a get request
+ response = requests.get(repo_url)
+ # if status code is 200 or 403 (forbidden to browse but
+ # not to git clone or svn checkout), consider the url valid
+ if response.status_code in [200, 403]:
+ return repo_url
+ return None
+ except:
+ # some repos may not be under https, try http then
+ if repo_url.startswith('https'):
+ return _check_repo_url(repo_url.replace('https', 'http'))
+ return None
+
+
+def _try_to_get_origin_url_and_type(ff_baseurl, project):
+ """
+ Try to find git or svn repository url for
+ a project hosted on a FusionForge instance.
+ """
+
+ # standard FusionForge anonymous svn repository url
+ svn_repo_url = _check_repo_url(
+ 'https://scm.%s/anonscm/svn/%s/' % (ff_baseurl, project))
+ if svn_repo_url:
+ return svn_repo_url, 'svn'
+
+ # standard FusionForge anonymous git repository url
+ git_repo_url = _check_repo_url(
+ 'https://scm.%s/anonscm/git/%s/%s.git' %
+ (ff_baseurl, project, project))
+ if git_repo_url:
+ return git_repo_url, 'git'
+
+ # other possible FusionForge anonymous git repository url
+ git_repo_url = _check_repo_url(
+ 'https://%s/anonscm/git/%s/%s.git' %
+ (ff_baseurl, project, project))
+ if git_repo_url:
+ return git_repo_url, 'git'
+
+ # some svn repository may only be reference by svn:// url type
+ # use subvertpy to check their availability
+ try:
+ svn_repo_url = 'svn://scm.%s/svn/%s' % (ff_baseurl, project)
+ RemoteAccess(svn_repo_url)
+ return svn_repo_url, 'svn'
+ except:
+ pass
+
+ # other possible url schemes for git and svn repos (for instance
+ # those from https://sourcesup.renater.fr)
+ baseurl_parts = ff_baseurl.split('.')
+ if len(baseurl_parts) > 2:
+ for scm in ['subversion', 'svn']:
+ baseurl_parts[0] = scm
+ svn_repo_url = 'https://' + '.'.join(baseurl_parts) + '/' + project + '/' # noqa
+ svn_repo_url = _check_repo_url(svn_repo_url)
+ if svn_repo_url:
+ return svn_repo_url, 'svn'
+
+ baseurl_parts[0] = 'git'
+ git_repo_url = 'https://' + '.'.join(baseurl_parts) + '/' + project + '.git' # noqa
+ git_repo_url = _check_repo_url(git_repo_url)
+ if git_repo_url:
+ return git_repo_url, 'git'
+
+ return None, None
+
+
+def _can_load_svn_repo(svn_repo_url):
+ """
+ Check if a svn repository is valid and
+ contains at least one revision.
+ """
+ ret = False
+ try:
+ conn = RemoteAccess(svn_repo_url)
+ ret = conn.get_latest_revnum() > 0
+ except:
+ pass
+ return ret
+
+
+class FusionForgeLister(SWHListerHttpTransport, SWHListerBase):
+ """
+ Lister for FusionForge.
+ It takes a list of forge urls from the configuration
+ to list projects from (eg. sourcesup.renater.fr, adullact.net,
+ gforge.inria.fr) and create swh loading tasks.
+ To list the projects, the FusionForge SOAP web services are used.
+ """
+ MODEL = FusionForgeModel
+ PATH_TEMPLATE = None
+
+ @property
+ def ADDITIONAL_CONFIG(self): # noqa: N802
+ config = super().ADDITIONAL_CONFIG
+ config['fusionforge_origins'] = ('list',
+ [{'baseurl': 'gforge.inria.fr',
+ 'user': None,
+ 'password': None}])
+ return config
+
+ def __init__(self, lister_name='fusionforge', override_config=None):
+ SWHListerHttpTransport.__init__(self, api_baseurl="bogus")
+ SWHListerBase.__init__(self, lister_name=lister_name,
+ override_config=override_config)
+ self.soap_client = None
+ self.session = ''
+
+ def transport_quota_check(self, response):
+ return False, 0
+
+ def transport_request(self, identifier):
+ """
+ Retrieve metadata associated to a FusionForge project
+ using the getGroupsByName SOAP service.
+ """
+ response = None
+ try:
+ response = self.soap_client.getGroupsByName(
+ self.session, [identifier])[0]
+ except:
+ pass
+ return response
+
+ def transport_response_simplified(self, response):
+ """
+ Process metadata for a FusionForge project and try to find url
+ for its code repository (only git and svn at the moment).
+ In that latter case, return a model in order to create a swh
+ loading task.
+ """
+ ret = []
+ if response:
+ project_name = response.unix_group_name
+ # if the FusionForge SOAP API has the repositoryapi_* service
+ # we already have put in cache the code repository urls
+ if project_name in self.repos_data:
+ origin_url = self.repos_data[project_name].repository_urls[0]
+ origin_type = self.repos_data[project_name].repository_type
+ # otherwise we try to find the repository url
+ else:
+ origin_url, origin_type = _try_to_get_origin_url_and_type(
+ self.fusionforge_baseurl, project_name)
+
+ # filter out invalid or empty svn repos
+ if origin_type == 'svn' and not _can_load_svn_repo(origin_url):
+ origin_url = None
+
+ if origin_url:
+ ret.append({'uid': project_name,
+ 'indexable': project_name,
+ 'name': project_name,
+ 'full_name': response.group_name,
+ 'html_url': response.homepage,
+ 'origin_url': origin_url,
+ 'origin_type': origin_type,
+ 'description': response.short_description
+ })
+ else:
+ logging.info('Unable to find git or svn repository url '
+ 'for project %s from FusionForge %s' %
+ (project_name, self.fusionforge_baseurl))
+ return ret
+
+ def run(self):
+ """
+ Run the lister.
+ """
+ fusionforge_origins = self.config['fusionforge_origins']
+
+ ret = False
+
+ for fusionforge_origin in fusionforge_origins:
+ self.fusionforge_baseurl = fusionforge_origin['baseurl']
+ forge_wsdl_url = 'https://%s/soap/?wsdl=1' % \
+ self.fusionforge_baseurl
+
+ # initialize SOAP client
+ try:
+ self.soap_client = SoapClient(forge_wsdl_url).service
+ except:
+ logging.error(
+ 'Unable to initialize web service client from url %s' %
+ forge_wsdl_url)
+ continue
+
+ # case where login is required to use the web services
+ user = fusionforge_origin.get('user', None)
+ password = fusionforge_origin.get('password', None)
+ if user and password:
+ try:
+ self.session = self.soap_client.login(user, password)
+ except:
+ logging.error(
+ 'Failed to authenticate user %s on %s' %
+ (user, self.fusionforge_baseurl))
+ continue
+
+ self.repos_data = {}
+ projects = []
+
+ # some forge (for instance adullact.net) have deployed a service
+ # to list code repositories, try to use it
+ try:
+ repos_list = self.soap_client.repositoryapi_repositoryList(self.session) # noqa
+
+ # create a cache holding repositories data for latter use
+ # in the listing process
+ for repo in repos_list:
+ project_name = repo.repository_id.split('/')[-1]
+ projects.append(project_name)
+ self.repos_data[project_name] = repo
+ except:
+ pass
+
+ # get all public project names on the forge
+ public_projects = self.soap_client.getPublicProjectNames(self.session) # noqa
+ projects = set(projects) | set(public_projects)
+
+ # process the projects and try to find their code repository
+ loop_count = 0
+ for project in projects:
+ self.ingest_data(project)
+ loop_count += 1
+ if loop_count == 20:
+ loop_count = 0
+ self.db_session.commit()
+ self.db_session = self.mk_session()
+
+ ret = True
+
+ return ret
+
+ def task_dict(self, origin_type, origin_url):
+ """
+ Create scheduler task for loading the found
+ repositories during projects listing.
+ """
+ tsk_dict = super().task_dict(origin_type, origin_url)
+ # set one shot task for the moment
+ tsk_dict['policy'] = 'oneshot'
+ # required parameters for the svn loader
+ if origin_type == 'svn':
+ tsk_dict['arguments']['kwargs']['destination_path'] = '/tmp'
+ tsk_dict['arguments']['kwargs']['svn_url'] = origin_url
+ return tsk_dict
diff --git a/swh/lister/fusionforge/models.py b/swh/lister/fusionforge/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fusionforge/models.py
@@ -0,0 +1,15 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from swh.lister.core.models import ModelBase
+
+
+class FusionForgeModel(ModelBase):
+ """a FusionForge repository"""
+ __tablename__ = 'fusionforge_repos'
+
+ uid = Column(String, primary_key=True)
+ indexable = Column(String, index=True)
diff --git a/swh/lister/fusionforge/tasks.py b/swh/lister/fusionforge/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fusionforge/tasks.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.core.tasks import ListerTaskBase # noqa
+
+from .lister import FusionForgeLister
+
+
+class FusionForgeListerTask(ListerTaskBase):
+ task_queue = 'swh_lister_fusionforge'
+
+ def new_lister(self):
+ return FusionForgeLister()
+
+ def run_task(self):
+ lister = self.new_lister()
+ return lister.run()

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 7:01 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230934

Event Timeline