diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py index c0a7208d..05665ab7 100644 --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,350 +1,350 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from bisect import bisect_right from django.core.exceptions import ObjectDoesNotExist from django.core.exceptions import ValidationError from django.core.validators import URLValidator from swh.web import config from swh.web.common import service from swh.web.common.exc import BadInputExc, ForbiddenExc from swh.web.common.models import ( SaveUnauthorizedOrigin, SaveAuthorizedOrigin, SaveOriginRequest, SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEED, SAVE_TASK_FAILED ) from swh.web.common.utils import get_origin_visits, parse_timestamp from swh.scheduler.utils import create_oneshot_task_dict scheduler = config.scheduler() def get_origin_save_authorized_urls(): """ Get the list of origin url prefixes authorized to be immediately loaded into the archive (whitelist). Returns: list: The list of authorized origin url prefix """ return [origin.url for origin in SaveAuthorizedOrigin.objects.all()] def get_origin_save_unauthorized_urls(): """ Get the list of origin url prefixes forbidden to be loaded into the archive (blacklist). Returns: list: the list of unauthorized origin url prefix """ return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()] def can_save_origin(origin_url): """ Check if a software origin can be saved into the archive. Based on the origin url, the save request will be either: * immediately accepted if the url is whitelisted * rejected if the url is blacklisted * put in pending state for manual review otherwise Args: origin_url (str): the software origin url to check Returns: str: the origin save request status, either *accepted*, *rejected* or *pending* """ # origin url may be blacklisted for url_prefix in get_origin_save_unauthorized_urls(): if origin_url.startswith(url_prefix): return SAVE_REQUEST_REJECTED # if the origin url is in the white list, it can be immediately saved for url_prefix in get_origin_save_authorized_urls(): if origin_url.startswith(url_prefix): return SAVE_REQUEST_ACCEPTED # otherwise, the origin url needs to be manually verified return SAVE_REQUEST_PENDING # map origin type to scheduler task # TODO: do not hardcode the task name here # TODO: unlock hg and svn loading once the scheduler # loading tasks are available in production _origin_type_task = { 'git': 'origin-update-git', # 'hg': 'origin-load-hg', - # 'svn': 'origin-load-svn' + 'svn': 'origin-load-svn-dump' } # map scheduler task status to origin save status _save_task_status = { 'next_run_not_scheduled': SAVE_TASK_NOT_YET_SCHEDULED, 'next_run_scheduled': SAVE_TASK_SCHEDULED, 'completed': SAVE_TASK_SUCCEED, 'disabled': SAVE_TASK_FAILED } def get_savable_origin_types(): return sorted(list(_origin_type_task.keys())) def _check_origin_type_savable(origin_type): """ Get the list of software origin types that can be loaded through a save request. Returns: list: the list of savable origin types """ allowed_origin_types = ', '.join(get_savable_origin_types()) if origin_type not in _origin_type_task: raise BadInputExc('Origin of type %s can not be saved! ' 'Allowed types are the following: %s' % (origin_type, allowed_origin_types)) _validate_url = URLValidator(schemes=['http', 'https', 'svn', 'git']) def _check_origin_url_valid(origin_url): try: _validate_url(origin_url) except ValidationError: raise BadInputExc('The provided origin url (%s) is not valid!' % origin_url) def _get_visit_date_for_save_request(save_request): visit_date = None try: origin = {'type': save_request.origin_type, 'url': save_request.origin_url} origin_info = service.lookup_origin(origin) origin_visits = get_origin_visits(origin_info) visit_dates = [parse_timestamp(v['date']) for v in origin_visits] i = bisect_right(visit_dates, save_request.request_date) if i != len(visit_dates): save_request.visit_date = visit_dates[i] save_request.save() visit_date = visit_dates[i] except Exception: pass return visit_date def _save_request_dict(save_request, task=None): visit_date = save_request.visit_date if task: save_task_status = _save_task_status[task['status']] if save_task_status in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEED) \ and not visit_date: visit_date = _get_visit_date_for_save_request(save_request) # Ensure last origin visit is available in database # before reporting the task execution as successful if save_task_status == SAVE_TASK_SUCCEED and not visit_date: save_task_status = SAVE_TASK_SCHEDULED save_request.loading_task_status = save_task_status save_request.save() else: save_task_status = save_request.loading_task_status return {'origin_type': save_request.origin_type, 'origin_url': save_request.origin_url, 'save_request_date': save_request.request_date.isoformat(), 'save_request_status': save_request.status, 'save_task_status': save_task_status, 'visit_date': visit_date.isoformat() if visit_date else None} def create_save_origin_request(origin_type, origin_url): """ Create a loading task to save a software origin into the archive. This function aims to create a software origin loading task trough the use of the swh-scheduler component. First, some checks are performed to see if the origin type and url are valid but also if the the save request can be accepted. If those checks passed, the loading task is then created. Otherwise, the save request is put in pending or rejected state. All the submitted save requests are logged into the swh-web database to keep track of them. Args: origin_type (str): the type of origin to save (*git*, *hg*, *svn*, ...) origin_url (str): the url of the origin to save Raises: BadInputExc: the origin type or url is invalid ForbiddenExc: the provided origin url is blacklisted Returns: dict: A dict describing the save request with the following keys: * **origin_type**: the type of the origin to save * **origin_url**: the url of the origin * **save_request_date**: the date the request was submitted * **save_request_status**: the request status, either *accepted*, *rejected* or *pending* * **save_task_status**: the origin loading task status, either *not created*, *not yet scheduled*, *scheduled*, *succeed* or *failed* """ _check_origin_type_savable(origin_type) - _check_origin_url_valid(origin_url) + # _check_origin_url_valid(origin_url) save_request_status = can_save_origin(origin_url) task = None # if the origin save request is accepted, create a scheduler # task to load it into the archive if save_request_status == SAVE_REQUEST_ACCEPTED: # create a task with high priority kwargs = {'priority': 'high'} # set task parameters according to the origin type if origin_type == 'git': kwargs['repo_url'] = origin_url elif origin_type == 'hg': kwargs['origin_url'] = origin_url elif origin_type == 'svn': kwargs['origin_url'] = origin_url kwargs['svn_url'] = origin_url sor = None # get list of previously sumitted save requests current_sors = \ list(SaveOriginRequest.objects.filter(origin_type=origin_type, origin_url=origin_url)) can_create_task = False # if no save requests previously submitted, create the scheduler task if not current_sors: can_create_task = True else: # get the latest submitted save request sor = current_sors[0] # if it was in pending state, we need to create the scheduler task # and update the save request info in the database if sor.status == SAVE_REQUEST_PENDING: can_create_task = True # a task has already been created to load the origin elif sor.loading_task_id != -1: # get the scheduler task and its status tasks = scheduler.get_tasks([sor.loading_task_id]) task = tasks[0] if tasks else None task_status = _save_request_dict(sor, task)['save_task_status'] # create a new scheduler task only if the previous one has been # already executed if task_status == SAVE_TASK_FAILED or \ task_status == SAVE_TASK_SUCCEED: can_create_task = True sor = None else: can_create_task = False if can_create_task: # effectively create the scheduler task task_dict = create_oneshot_task_dict( _origin_type_task[origin_type], **kwargs) task = scheduler.create_tasks([task_dict])[0] # pending save request has been accepted if sor: sor.status = SAVE_REQUEST_ACCEPTED sor.loading_task_id = task['id'] sor.save() else: sor = SaveOriginRequest.objects.create(origin_type=origin_type, origin_url=origin_url, status=save_request_status, # noqa loading_task_id=task['id']) # noqa # save request must be manually reviewed for acceptation elif save_request_status == SAVE_REQUEST_PENDING: # check if there is already such a save request already submitted, # no need to add it to the database in that case try: sor = SaveOriginRequest.objects.get(origin_type=origin_type, origin_url=origin_url, status=save_request_status) # if not add it to the database except ObjectDoesNotExist: sor = SaveOriginRequest.objects.create(origin_type=origin_type, origin_url=origin_url, status=save_request_status) # origin can not be saved as its url is blacklisted, # log the request to the database anyway else: sor = SaveOriginRequest.objects.create(origin_type=origin_type, origin_url=origin_url, status=save_request_status) if save_request_status == SAVE_REQUEST_REJECTED: raise ForbiddenExc('The origin url is blacklisted and will not be ' 'loaded into the archive.') return _save_request_dict(sor, task) def get_save_origin_requests_from_queryset(requests_queryset): """ Get all save requests from a SaveOriginRequest queryset. Args: requests_queryset (django.db.models.QuerySet): input SaveOriginRequest queryset Returns: list: A list of save origin requests dict as described in :func:`swh.web.common.origin_save.create_save_origin_request` """ requests = [] for sor in requests_queryset: task = None # save task has not been created, no need to query # the scheduler API if sor.loading_task_id != -1: tasks = scheduler.get_tasks([sor.loading_task_id]) task = tasks[0] if tasks else None requests.append(_save_request_dict(sor, task)) return requests def get_save_origin_requests(origin_type, origin_url): """ Get all save requests for a given software origin. Args: origin_type (str): the type of the origin origin_url (str): the url of the origin Raises: BadInputExc: the origin type or url is invalid Returns: list: A list of save origin requests dict as described in :func:`swh.web.common.origin_save.create_save_origin_request` """ _check_origin_type_savable(origin_type) _check_origin_url_valid(origin_url) sors = SaveOriginRequest.objects.filter(origin_type=origin_type, origin_url=origin_url) return get_save_origin_requests_from_queryset(sors) diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py index 354c603c..d0dc3f43 100644 --- a/swh/web/common/utils.py +++ b/swh/web/common/utils.py @@ -1,388 +1,392 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import docutils.parsers.rst import docutils.utils import re import requests from datetime import datetime, timezone from dateutil import parser as date_parser from dateutil import tz from django.core.cache import cache from django.core import urlresolvers from django.http import QueryDict from swh.model.exceptions import ValidationError from swh.model.identifiers import ( persistent_identifier, parse_persistent_identifier, CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT ) from swh.web.common import service from swh.web.common.exc import BadInputExc from swh.web.config import get_config def reverse(viewname, args=None, kwargs=None, query_params=None, current_app=None, urlconf=None): """An override of django reverse function supporting query parameters. Args: viewname: the name of the django view from which to compute a url args: list of url arguments ordered according to their position it kwargs: dictionary of url arguments indexed by their names query_params: dictionary of query parameters to append to the reversed url current_app: the name of the django app tighted to the view urlconf: url configuration module Returns: str: the url of the requested view with processed arguments and query parameters """ if kwargs: kwargs = {k: v for k, v in kwargs.items() if v is not None} url = urlresolvers.reverse( viewname, urlconf=urlconf, args=args, kwargs=kwargs, current_app=current_app) if query_params: query_params = {k: v for k, v in query_params.items() if v is not None} if query_params and len(query_params) > 0: query_dict = QueryDict('', mutable=True) for k in sorted(query_params.keys()): query_dict[k] = query_params[k] url += ('?' + query_dict.urlencode(safe='/;:')) return url def datetime_to_utc(date): """Returns datetime in UTC without timezone info Args: date (datetime.datetime): input datetime with timezone info Returns: datetime.datetime: datetime in UTC without timezone info """ if date.tzinfo: return date.astimezone(tz.gettz('UTC')).replace(tzinfo=timezone.utc) else: return date def parse_timestamp(timestamp): """Given a time or timestamp (as string), parse the result as UTC datetime. Returns: datetime.datetime: a timezone-aware datetime representing the parsed value or None if the parsing fails. Samples: - 2016-01-12 - 2016-01-12T09:19:12+0100 - Today is January 1, 2047 at 8:21:00AM - 1452591542 """ if not timestamp: return None try: date = date_parser.parse(timestamp, ignoretz=False, fuzzy=True) return datetime_to_utc(date) except Exception: try: return datetime.utcfromtimestamp(float(timestamp)).replace( tzinfo=timezone.utc) except (ValueError, OverflowError) as e: raise BadInputExc(e) def shorten_path(path): """Shorten the given path: for each hash present, only return the first 8 characters followed by an ellipsis""" sha256_re = r'([0-9a-f]{8})[0-9a-z]{56}' sha1_re = r'([0-9a-f]{8})[0-9a-f]{32}' ret = re.sub(sha256_re, r'\1...', path) return re.sub(sha1_re, r'\1...', ret) def format_utc_iso_date(iso_date, fmt='%d %B %Y, %H:%M UTC'): """Turns a string reprensation of an ISO 8601 date string to UTC and format it into a more human readable one. For instance, from the following input string: '2017-05-04T13:27:13+02:00' the following one is returned: '04 May 2017, 11:27 UTC'. Custom format string may also be provided as parameter Args: iso_date (str): a string representation of an ISO 8601 date fmt (str): optional date formatting string Returns: str: a formatted string representation of the input iso date """ if not iso_date: return iso_date date = parse_timestamp(iso_date) return date.strftime(fmt) def gen_path_info(path): """Function to generate path data navigation for use with a breadcrumb in the swh web ui. For instance, from a path /folder1/folder2/folder3, it returns the following list:: [{'name': 'folder1', 'path': 'folder1'}, {'name': 'folder2', 'path': 'folder1/folder2'}, {'name': 'folder3', 'path': 'folder1/folder2/folder3'}] Args: path: a filesystem path Returns: list: a list of path data for navigation as illustrated above. """ path_info = [] if path: sub_paths = path.strip('/').split('/') path_from_root = '' for p in sub_paths: path_from_root += '/' + p path_info.append({'name': p, 'path': path_from_root.strip('/')}) return path_info def get_origin_visits(origin_info): """Function that returns the list of visits for a swh origin. That list is put in cache in order to speedup the navigation in the swh web browse ui. Args: origin_id (int): the id of the swh origin to fetch visits from Returns: list: A list of dict describing the origin visits with the following keys: * **date**: UTC visit date in ISO format, * **origin**: the origin id * **status**: the visit status, either *full* or *partial* * **visit**: the visit id Raises: NotFoundExc: if the origin is not found """ cache_entry_id = 'origin_%s_visits' % origin_info['id'] cache_entry = cache.get(cache_entry_id) - if cache_entry: + last_snapshot = service.lookup_latest_origin_snapshot(origin_info['id']) + + if cache_entry and \ + (not last_snapshot or + last_snapshot['id'] == cache_entry[-1]['snapshot']): return cache_entry origin_visits = [] per_page = service.MAX_LIMIT last_visit = None while 1: visits = list(service.lookup_origin_visits(origin_info['id'], last_visit=last_visit, per_page=per_page)) origin_visits += visits if len(visits) < per_page: break else: if not last_visit: last_visit = per_page else: last_visit += per_page def _visit_sort_key(visit): ts = parse_timestamp(visit['date']).timestamp() return ts + (float(visit['visit']) / 10e3) for v in origin_visits: if 'metadata' in v: del v['metadata'] origin_visits = [dict(t) for t in set([tuple(d.items()) for d in origin_visits])] origin_visits = sorted(origin_visits, key=lambda v: _visit_sort_key(v)) cache.set(cache_entry_id, origin_visits) return origin_visits def get_swh_persistent_id(object_type, object_id, scheme_version=1): """ Returns the persistent identifier for a swh object based on: * the object type * the object id * the swh identifiers scheme version Args: object_type (str): the swh object type (content/directory/release/revision/snapshot) object_id (str): the swh object id (hexadecimal representation of its hash value) scheme_version (int): the scheme version of the swh persistent identifiers Returns: str: the swh object persistent identifier Raises: BadInputExc: if the provided parameters do not enable to generate a valid identifier """ try: swh_id = persistent_identifier(object_type, object_id, scheme_version) except ValidationError as e: raise BadInputExc('Invalid object (%s) for swh persistent id. %s' % (object_id, e)) else: return swh_id def resolve_swh_persistent_id(swh_id, query_params=None): """ Try to resolve a SWH persistent id into an url for browsing the pointed object. Args: swh_id (str): a SWH persistent identifier query_params (django.http.QueryDict): optional dict filled with query parameters to append to the browse url Returns: dict: a dict with the following keys: * **swh_id_parsed (swh.model.identifiers.PersistentId)**: the parsed identifier * **browse_url (str)**: the url for browsing the pointed object Raises: BadInputExc: if the provided identifier can not be parsed """ # noqa try: swh_id_parsed = parse_persistent_identifier(swh_id) object_type = swh_id_parsed.object_type object_id = swh_id_parsed.object_id browse_url = None query_dict = QueryDict('', mutable=True) if query_params and len(query_params) > 0: for k in sorted(query_params.keys()): query_dict[k] = query_params[k] if 'origin' in swh_id_parsed.metadata: query_dict['origin'] = swh_id_parsed.metadata['origin'] if object_type == CONTENT: query_string = 'sha1_git:' + object_id fragment = '' if 'lines' in swh_id_parsed.metadata: lines = swh_id_parsed.metadata['lines'].split('-') fragment += '#L' + lines[0] if len(lines) > 1: fragment += '-L' + lines[1] browse_url = reverse('browse-content', kwargs={'query_string': query_string}, query_params=query_dict) + fragment elif object_type == DIRECTORY: browse_url = reverse('browse-directory', kwargs={'sha1_git': object_id}, query_params=query_dict) elif object_type == RELEASE: browse_url = reverse('browse-release', kwargs={'sha1_git': object_id}, query_params=query_dict) elif object_type == REVISION: browse_url = reverse('browse-revision', kwargs={'sha1_git': object_id}, query_params=query_dict) elif object_type == SNAPSHOT: browse_url = reverse('browse-snapshot', kwargs={'snapshot_id': object_id}, query_params=query_dict) except ValidationError as ve: raise BadInputExc('Error when parsing identifier. %s' % ' '.join(ve.messages)) else: return {'swh_id_parsed': swh_id_parsed, 'browse_url': browse_url} def parse_rst(text, report_level=2): """ Parse a reStructuredText string with docutils. Args: text (str): string with reStructuredText markups in it report_level (int): level of docutils report messages to print (1 info 2 warning 3 error 4 severe 5 none) Returns: docutils.nodes.document: a parsed docutils document """ parser = docutils.parsers.rst.Parser() components = (docutils.parsers.rst.Parser,) settings = docutils.frontend.OptionParser( components=components).get_default_values() settings.report_level = report_level document = docutils.utils.new_document('rst-doc', settings=settings) parser.parse(text, document) return document def get_client_ip(request): """ Return the client IP address from an incoming HTTP request. Args: request (django.http.HttpRequest): the incoming HTTP request Returns: str: The client IP address """ x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR') if x_forwarded_for: ip = x_forwarded_for.split(',')[0] else: ip = request.META.get('REMOTE_ADDR') return ip def is_recaptcha_valid(request, recaptcha_response): """ Verify if the response for Google reCAPTCHA is valid. Args: request (django.http.HttpRequest): the incoming HTTP request recaptcha_response (str): the reCAPTCHA response Returns: bool: Wether the reCAPTCHA response is valid or not """ config = get_config() return requests.post( config['grecaptcha']['validation_url'], data={ 'secret': config['grecaptcha']['private_key'], 'response': recaptcha_response, 'remoteip': get_client_ip(request) }, verify=True ).json().get("success", False)