Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/sourceforge/lister.py
- This file was added.
# Copyright (C) 2017 the Software Heritage developers | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
# Lister for projects hosted on SourceForge.
# As the SourceForge REST API does not provide a way to list projects,
# we use the rsync mirror of the files hosted on SourceForge
# to retrieve the project names (we will surely miss some, but
# the most important ones should be retrieved).
from swh.lister.sourceforge.models import SourceForgeModel | |||||
from swh.lister.core.indexing_lister import SWHIndexingHttpLister | |||||
import bisect | |||||
import re | |||||
import requests | |||||
import subprocess | |||||
import string | |||||
# url of rsync mirror for sourceforge projects | |||||
_sf_mirror_rsync_baseurl = \ | |||||
'rsync://rsync.mirrorservice.org/downloads.sourceforge.net' | |||||
# rsync://netix.dl.sourceforge.net/sfmir | |||||
# sample output when using rsync to list sourceforge projects | |||||
# $ rsync --list-only rsync://netix.dl.sourceforge.net/sfmir/a/ | |||||
# drwxr-xr-x 4,096 2016/08/29 20:02:18 . | |||||
# drwxr-xr-x 126 2017/11/01 01:24:09 a- | |||||
# drwxr-xr-x 10 2017/01/17 22:53:53 a0 | |||||
# drwxr-xr-x 47 2017/08/05 03:34:16 a1 | |||||
# drwxr-xr-x 4,096 2017/11/02 01:28:17 a2 | |||||
# drwxr-xr-x 105 2017/10/07 02:27:09 a3 | |||||
# drwxr-xr-x 29 2016/12/30 05:15:03 a4 | |||||
# drwxr-xr-x 10 2017/09/12 23:08:25 a5 | |||||
# drwxr-xr-x 29 2017/07/14 02:29:20 a6 | |||||
# drwxr-xr-x 10 2017/01/17 22:53:58 a7 | |||||
# drwxr-xr-x 40 2016/08/27 14:41:11 a8 | |||||
# drwxr-xr-x 10 2017/01/17 22:53:58 a9 | |||||
# drwxr-xr-x 4,096 2017/10/27 23:18:03 aa | |||||
# drwxr-xr-x 4,096 2017/10/12 10:54:39 ab | |||||
# drwxr-xr-x 8,192 2017/10/31 02:23:46 ac | |||||
# drwxr-xr-x 12,288 2017/11/01 17:05:02 ad | |||||
# drwxr-xr-x 4,096 2017/10/31 01:28:09 ae | |||||
# drwxr-xr-x 4,096 2017/11/01 01:24:09 af | |||||
# drwxr-xr-x 4,096 2017/10/29 23:25:01 ag | |||||
# drwxr-xr-x 4,096 2017/09/11 20:25:36 ah | |||||
# drwxr-xr-x 8,192 2017/10/20 17:46:55 ai | |||||
# drwxr-xr-x 4,096 2017/09/24 20:00:01 aj | |||||
# drwxr-xr-x 4,096 2017/09/25 16:30:01 ak | |||||
# drwxr-xr-x 12,288 2017/11/02 05:56:13 al | |||||
# drwxr-xr-x 8,192 2017/10/23 23:52:57 am | |||||
# drwxr-xr-x 20,480 2017/11/01 14:59:02 an | |||||
# drwxr-xr-x 4,096 2017/10/02 07:00:01 ao | |||||
# drwxr-xr-x 12,288 2017/10/31 14:15:01 ap | |||||
# drwxr-xr-x 4,096 2017/10/02 18:13:44 aq | |||||
# drwxr-xr-x 16,384 2017/10/29 14:50:01 ar | |||||
# drwxr-xr-x 16,384 2017/10/31 21:15:01 as | |||||
# drwxr-xr-x 8,192 2017/10/26 20:29:25 at | |||||
# drwxr-xr-x 16,384 2017/10/30 16:25:01 au | |||||
# drwxr-xr-x 4,096 2017/10/24 00:24:00 av | |||||
# drwxr-xr-x 4,096 2017/10/05 01:00:01 aw | |||||
# drwxr-xr-x 4,096 2017/10/27 02:26:58 ax | |||||
# drwxr-xr-x 4,096 2017/10/25 22:35:02 ay | |||||
# drwxr-xr-x 4,096 2017/10/15 17:55:01 az | |||||
# set of characters for the first letter of a folder containing sf projects | |||||
_sf_subdir_first_char_set = string.ascii_lowercase | |||||
# set of characters for the second letter of a folder containing sf projects | |||||
_sf_subdir_second_char_set = '-' + string.digits + string.ascii_lowercase | |||||
# cache for rsync listing ouput | |||||
_projects_list_cache = {} | |||||
def _list_sf_projects_in_subdir(sf_mirror_rsync_baseurl, subdir):
    """
    Utility function to list sourceforge projects with rsync
    located in the folder xy from url (sf_mirror_rsync_baseurl)/(x)/(x)(y)/

    The listing of each folder is computed once and then served from
    ``_projects_list_cache``.

    Args:
        sf_mirror_rsync_baseurl: base url of the sourceforge rsync mirror
        subdir: name of the folder to list (the "xy" part of the url)

    Returns:
        The sorted list of project names found in the folder. The list is
        empty when the folder could not be listed (rsync missing, rsync
        exiting with an error, ...); such failures are logged, not raised.
    """
    if not subdir:
        # nothing sensible to list without a folder name
        return []

    if subdir in _projects_list_cache:
        return _projects_list_cache[subdir]

    # local import: keeps this block self-contained with respect to the
    # module import list
    import logging

    projects = []
    folder_url = '%s/%s/%s/' % (sf_mirror_rsync_baseurl, subdir[0], subdir)
    try:
        # call rsync to list the desired folder
        output = subprocess.check_output(
            ['rsync', '--list-only', folder_url],
            stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError) as exc:
        # best effort: a folder that cannot be listed yields no project,
        # but the failure is made visible instead of silently dropped
        logging.getLogger(__name__).warning(
            'unable to list %s with rsync: %s', folder_url, exc)
    else:
        # undecodable bytes must not abort the whole listing
        for line in output.decode('utf-8', errors='replace').split('\n'):
            # only lines corresponding to a folder are of interest
            if not line.startswith('drwxr-xr-x'):
                continue
            columns = line.split()
            # the entry name is the fifth column; only consider folders
            # whose first letter is the same as the one from the listed
            # folder
            if len(columns) > 4 and columns[4].startswith(subdir[0]):
                projects.append(columns[4])

    # put retrieved projects list in cache
    _projects_list_cache[subdir] = sorted(projects)
    return _projects_list_cache[subdir]
def _next_sf_project_first_chars(current_first_chars=''):
    """
    Compute the name of the projects folder to list after the given one.

    Examples: aa -> ab, fg -> fh, gz -> h-, h7 -> h8.

    A missing first character is taken to be 'a' and a missing second
    character is taken to be '-'.

    Returns:
        The next two-character folder name, or None once 'zz' is reached.
    """
    head = current_first_chars[0] if len(current_first_chars) > 0 else 'a'
    tail = current_first_chars[1] if len(current_first_chars) > 1 else '-'

    # common case: simply move to the following second character
    if tail != 'z':
        position = _sf_subdir_second_char_set.index(tail)
        return head + _sf_subdir_second_char_set[position + 1]

    # 'zz' is the very last folder
    if head == 'z':
        return None

    # second character exhausted: advance the first one and restart the
    # second one from the beginning of its character set
    position = _sf_subdir_first_char_set.index(head)
    return (_sf_subdir_first_char_set[position + 1] +
            _sf_subdir_second_char_set[0])
def _next_sf_project(sf_mirror_rsync_baseurl, current_project=''):
    """
    Utility function to get the next sourceforge project name.

    Projects are walked folder by folder, in sorted order inside each
    folder. ``current_project`` does not need to be an existing project:
    the first project sorting strictly after it is returned.

    Args:
        sf_mirror_rsync_baseurl: base url of the sourceforge rsync mirror
        current_project: name of the project to start from; when empty,
            the very first project is returned

    Returns:
        The name of the next project, or None when there is none left.
    """
    if current_project:
        first_chars = current_project[:2]
        # one-letter names are looked up in the '<letter>-' folder
        if len(current_project) < 2:
            first_chars += '-'
    else:
        first_chars = 'a-'

    projects = _list_sf_projects_in_subdir(sf_mirror_rsync_baseurl,
                                           first_chars)
    if current_project:
        # bisect_right lands just after current_project when it is listed
        # and on the first greater name when it is not, so no project is
        # skipped in either case
        idx = bisect.bisect_right(projects, current_project)
        if idx < len(projects):
            return projects[idx]
    elif projects:
        return projects[0]

    # current folder exhausted (or empty): move on to the next non-empty
    # folder, stopping cleanly once the last folder has been visited
    next_f_chars = _next_sf_project_first_chars(first_chars)
    while next_f_chars:
        projects = _list_sf_projects_in_subdir(
            sf_mirror_rsync_baseurl, next_f_chars)
        if projects:
            return projects[0]
        next_f_chars = _next_sf_project_first_chars(next_f_chars)
    return None
class SourceForgeLister(SWHIndexingHttpLister):
    """
    Lister for the code repositories of projects hosted on SourceForge.

    Projects are indexed by their short name. The metadata of a project
    is fetched from the REST endpoint described by PATH_TEMPLATE, while
    the name of the following project is obtained from the listing of an
    rsync mirror (see _next_sf_project).
    """
    PATH_TEMPLATE = '/rest/p/%s/'
    MODEL = SourceForgeModel

    @property
    def ADDITIONAL_CONFIG(self):  # noqa: N802
        """
        Extend the parent configuration with sourceforge specific entries.
        """
        config = super().ADDITIONAL_CONFIG
        # base url of sourceforge rsync mirror
        config['sf_rsync_mirror_url'] = ('str', _sf_mirror_rsync_baseurl)
        # list of sf projects to skip (those for which a call to the
        # sf rest api fails)
        config['sf_projects_to_skip'] = ('list', ['cygwin-ports'])
        return config

    def get_model_from_sf_project_metadata(self, repo):
        """
        Build the model entries for the code repository of a project.

        Args:
            repo: project metadata, as a dict decoded from the JSON
                response of the REST API

        Returns:
            A list of dicts, one per non-empty code repository found in
            the project tools; empty when the metadata has no 'tools'
            entry or no usable code repository.
        """
        model = []
        # ensure input metadata are valid
        if 'tools' not in repo:
            return model
        # iterate over project services
        for tool in repo['tools']:
            # only keep version control systems hosted by the project,
            # not links to external services like GitHub
            if tool['mount_point'] != 'code' or tool['name'] == 'link':
                continue
            # we need to check that the code repository is not empty first
            # NOTE(review): self.api_baseurl is not defined in this class,
            # presumably provided by the parent lister
            resp = requests.get('%s%s' % (self.api_baseurl, tool['url']))
            if resp.status_code != 200 or \
                    b'No (more) commits' in resp.content:
                continue
            # code repository is not empty, now retrieve the origin url
            # based on the used vcs tool
            shortname = repo['shortname']
            if tool['name'] == 'bzr':
                # bazaar repo special case
                origin_url = 'bzr://%s.bzr.sourceforge.net/bzrroot/%s' % \
                    (shortname, shortname)
            elif tool['name'] == 'cvs':
                # cvs repo special case
                origin_url = '%s.cvs.sourceforge.net:/cvsroot/%s' % \
                    (shortname, shortname)
            else:
                # for hg, git and svn
                origin_url = 'https://%s.code.sf.net%s' % \
                    (tool['name'], tool['url'])
            # append to model for each code repository found
            model.append({'uid': shortname,
                          'indexable': shortname,
                          'name': shortname,
                          'full_name': repo['name'],
                          'html_url': repo['url'],
                          'origin_url': origin_url,
                          'origin_type': tool['name'],
                          'description': repo['short_description']
                          })
        return model

    def _next_project_to_list(self, current_project):
        """
        Return the first project after current_project that is not in the
        configured list of projects to skip (None when no project is left).
        """
        mirror_url = self.config['sf_rsync_mirror_url']
        to_skip = self.config['sf_projects_to_skip']
        next_project = _next_sf_project(mirror_url, current_project)
        while next_project in to_skip:
            next_project = _next_sf_project(mirror_url, next_project)
        return next_project

    def get_next_target_from_response(self, response):
        """
        Compute the name of the project to request after the given response.

        Args:
            response: HTTP response of the REST API for the current project

        Returns:
            The name of the next project to list, or None when the
            response holds no 'shortname' or no project is left.
        """
        # special case when the provided min_index does
        # not correspond to a sf project name: restart from the name
        # found in the requested url
        if response.status_code == 404:
            requested = re.search(r'^.*/(.*)/$', response.url).group(1)
            # the skip list applies here too, otherwise a project known to
            # break the rest api could be returned
            return self._next_project_to_list(requested)
        body = response.json()
        # ensure current response is valid
        if 'shortname' not in body:
            return None
        # get next sourceforge project name
        return self._next_project_to_list(body['shortname'])

    def transport_response_simplified(self, response):
        """
        Convert a REST API response into a list of model entries
        (empty for any status code other than 200).
        """
        if response.status_code != 200:
            return []
        return self.get_model_from_sf_project_metadata(response.json())

    def is_within_bounds(self, inner, lower=None, upper=None):
        """
        Tell whether inner lies between lower and upper, both inclusive.
        A bound set to None is not checked.
        """
        if lower is None and upper is None:
            return True
        if lower is None:
            return inner <= upper
        if upper is None:
            return inner >= lower
        return lower <= inner <= upper