diff --git a/mypy.ini b/mypy.ini
index e4ae5aea..7ecdbae2 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,52 +1,55 @@
[mypy]
namespace_packages = True
warn_unused_ignores = True
# support for django magic: https://github.com/typeddjango/django-stubs
plugins = mypy_django_plugin.main
[mypy.plugins.django-stubs]
django_settings_module = swh.web.settings.development
# 3rd party libraries without stubs (yet)
[mypy-bs4.*]
ignore_missing_imports = True
[mypy-corsheaders.*]
ignore_missing_imports = True
[mypy-django_js_reverse.*]
ignore_missing_imports = True
[mypy-htmlmin.*]
ignore_missing_imports = True
[mypy-magic.*]
ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
+[mypy-prometheus_client.*]
+ignore_missing_imports = True
+
[mypy-pygments.*]
ignore_missing_imports = True
[mypy-pypandoc.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
[mypy-rest_framework.*]
ignore_missing_imports = True
[mypy-requests_mock.*]
ignore_missing_imports = True
[mypy-sphinx.*]
ignore_missing_imports = True
[mypy-sphinxcontrib.*]
ignore_missing_imports = True
[mypy-swh.docs.*]
ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
index 364e6201..626528b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,29 +1,30 @@
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
# Runtime dependencies
beautifulsoup4
Django >= 1.11.0, < 2.0
django-cors-headers
djangorestframework >= 3.4.0
django_webpack_loader
django_js_reverse
docutils
python-magic >= 0.4.0
htmlmin
lxml
+prometheus_client
pygments
pypandoc
python-dateutil
pyyaml
requests
python-memcached
pybadges
sentry-sdk
# Doc dependencies
sphinx
sphinxcontrib-httpdomain
diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py
index 56c8d70c..c938c750 100644
--- a/swh/web/common/origin_save.py
+++ b/swh/web/common/origin_save.py
@@ -1,529 +1,584 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from bisect import bisect_right
from datetime import datetime, timezone, timedelta
+from itertools import product
import json
import logging
from django.core.exceptions import ObjectDoesNotExist
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.utils.html import escape
+from prometheus_client import Gauge
+
import requests
import sentry_sdk
from swh.web import config
from swh.web.common import service
from swh.web.common.exc import BadInputExc, ForbiddenExc, NotFoundExc
from swh.web.common.models import (
SaveUnauthorizedOrigin, SaveAuthorizedOrigin, SaveOriginRequest,
SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING,
SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED,
- SAVE_TASK_SUCCEED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING
+ SAVE_TASK_SUCCEED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING,
+ SAVE_TASK_NOT_CREATED
)
from swh.web.common.origin_visits import get_origin_visits
-from swh.web.common.utils import parse_timestamp
+from swh.web.common.utils import parse_timestamp, SWH_WEB_METRICS_REGISTRY
from swh.scheduler.utils import create_oneshot_task_dict
scheduler = config.scheduler()
logger = logging.getLogger(__name__)
def get_origin_save_authorized_urls():
"""
Get the list of origin url prefixes authorized to be
immediately loaded into the archive (whitelist).
Returns:
list: The list of authorized origin url prefixes
"""
return [origin.url
for origin in SaveAuthorizedOrigin.objects.all()]
def get_origin_save_unauthorized_urls():
"""
Get the list of origin url prefixes forbidden to be
loaded into the archive (blacklist).
Returns:
list: the list of unauthorized origin url prefixes
"""
return [origin.url
for origin in SaveUnauthorizedOrigin.objects.all()]
def can_save_origin(origin_url):
"""
Check if a software origin can be saved into the archive.
Based on the origin url, the save request will be either:
* immediately accepted if the url is whitelisted
* rejected if the url is blacklisted
* put in pending state for manual review otherwise
Args:
origin_url (str): the software origin url to check
Returns:
str: the origin save request status, either **accepted**,
**rejected** or **pending**
"""
# origin url may be blacklisted
for url_prefix in get_origin_save_unauthorized_urls():
if origin_url.startswith(url_prefix):
return SAVE_REQUEST_REJECTED
# if the origin url is in the white list, it can be immediately saved
for url_prefix in get_origin_save_authorized_urls():
if origin_url.startswith(url_prefix):
return SAVE_REQUEST_ACCEPTED
# otherwise, the origin url needs to be manually verified
return SAVE_REQUEST_PENDING
# map visit type to scheduler task
# TODO: do not hardcode the task name here (T1157)
_visit_type_task = {
'git': 'load-git',
'hg': 'load-hg',
'svn': 'load-svn'
}
# map scheduler task status to origin save status
_save_task_status = {
'next_run_not_scheduled': SAVE_TASK_NOT_YET_SCHEDULED,
'next_run_scheduled': SAVE_TASK_SCHEDULED,
'completed': SAVE_TASK_SUCCEED,
'disabled': SAVE_TASK_FAILED
}
def get_savable_visit_types():
return sorted(list(_visit_type_task.keys()))
def _check_visit_type_savable(visit_type):
"""
Check if the given visit type can be performed
through a save request.
Raises:
BadInputExc: if the visit type is not savable
"""
allowed_visit_types = ', '.join(get_savable_visit_types())
if visit_type not in _visit_type_task:
raise BadInputExc('Visit of type %s can not be saved! '
'Allowed types are the following: %s' %
(visit_type, allowed_visit_types))
_validate_url = URLValidator(schemes=['http', 'https', 'svn', 'git'])
def _check_origin_url_valid(origin_url):
try:
_validate_url(origin_url)
except ValidationError:
raise BadInputExc('The provided origin url (%s) is not valid!' %
escape(origin_url))
def _get_visit_info_for_save_request(save_request):
visit_date = None
visit_status = None
try:
origin = {'url': save_request.origin_url}
origin_info = service.lookup_origin(origin)
origin_visits = get_origin_visits(origin_info)
visit_dates = [parse_timestamp(v['date'])
for v in origin_visits]
i = bisect_right(visit_dates, save_request.request_date)
if i != len(visit_dates):
visit_date = visit_dates[i]
visit_status = origin_visits[i]['status']
if origin_visits[i]['status'] == 'ongoing':
visit_date = None
except Exception as exc:
sentry_sdk.capture_exception(exc)
return visit_date, visit_status
def _check_visit_update_status(save_request, save_task_status):
visit_date, visit_status = _get_visit_info_for_save_request(save_request)
save_request.visit_date = visit_date
# visit has been performed, mark the saving task as succeeded
if visit_date and visit_status is not None:
save_task_status = SAVE_TASK_SUCCEED
elif visit_status == 'ongoing':
save_task_status = SAVE_TASK_RUNNING
else:
time_now = datetime.now(tz=timezone.utc)
time_delta = time_now - save_request.request_date
# consider the task as failed if it is still in scheduled state
# 30 days after its submission
if time_delta.days > 30:
save_task_status = SAVE_TASK_FAILED
return visit_date, save_task_status
def _save_request_dict(save_request, task=None):
must_save = False
visit_date = save_request.visit_date
# save task still in scheduler db
if task:
save_task_status = _save_task_status[task['status']]
# Consider request from which a visit date has already been found
# as succeeded to avoid retrieving it again
if save_task_status == SAVE_TASK_SCHEDULED and visit_date:
save_task_status = SAVE_TASK_SUCCEED
if save_task_status in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEED) \
and not visit_date:
visit_date, _ = _get_visit_info_for_save_request(save_request)
save_request.visit_date = visit_date
must_save = True
# Check tasks still marked as scheduled / not yet scheduled
if save_task_status in (SAVE_TASK_SCHEDULED,
SAVE_TASK_NOT_YET_SCHEDULED):
visit_date, save_task_status = _check_visit_update_status(
save_request, save_task_status)
# save task may have been archived
else:
save_task_status = save_request.loading_task_status
if save_task_status in (SAVE_TASK_SCHEDULED,
SAVE_TASK_NOT_YET_SCHEDULED):
visit_date, save_task_status = _check_visit_update_status(
save_request, save_task_status)
else:
save_task_status = save_request.loading_task_status
if save_request.loading_task_status != save_task_status:
save_request.loading_task_status = save_task_status
must_save = True
if must_save:
save_request.save()
return {'id': save_request.id,
'visit_type': save_request.visit_type,
'origin_url': save_request.origin_url,
'save_request_date': save_request.request_date.isoformat(),
'save_request_status': save_request.status,
'save_task_status': save_task_status,
'visit_date': visit_date.isoformat() if visit_date else None}
def create_save_origin_request(visit_type, origin_url):
"""
Create a loading task to save a software origin into the archive.
This function aims to create a software origin loading task
through the use of the swh-scheduler component.
First, some checks are performed to see if the visit type and origin
url are valid and if the save request can be accepted.
If those checks pass, the loading task is then created.
Otherwise, the save request is put in pending or rejected state.
All the submitted save requests are logged into the swh-web
database to keep track of them.
Args:
visit_type (str): the type of visit to perform (currently
``git``, ``hg`` or ``svn``)
origin_url (str): the url of the origin to save
Raises:
BadInputExc: the visit type or origin url is invalid
ForbiddenExc: the provided origin url is blacklisted
Returns:
dict: A dict describing the save request with the following keys:
* **visit_type**: the type of visit to perform
* **origin_url**: the url of the origin
* **save_request_date**: the date the request was submitted
* **save_request_status**: the request status, either **accepted**,
**rejected** or **pending**
* **save_task_status**: the origin loading task status, either
**not created**, **not yet scheduled**, **scheduled**,
**succeed** or **failed**
"""
_check_visit_type_savable(visit_type)
_check_origin_url_valid(origin_url)
save_request_status = can_save_origin(origin_url)
task = None
# if the origin save request is accepted, create a scheduler
# task to load it into the archive
if save_request_status == SAVE_REQUEST_ACCEPTED:
# create a task with high priority
kwargs = {
'priority': 'high',
'url': origin_url,
}
sor = None
# get the list of previously submitted save requests
current_sors = \
list(SaveOriginRequest.objects.filter(visit_type=visit_type,
origin_url=origin_url))
can_create_task = False
# if no save requests previously submitted, create the scheduler task
if not current_sors:
can_create_task = True
else:
# get the latest submitted save request
sor = current_sors[0]
# if it was in pending state, we need to create the scheduler task
# and update the save request info in the database
if sor.status == SAVE_REQUEST_PENDING:
can_create_task = True
# a task has already been created to load the origin
elif sor.loading_task_id != -1:
# get the scheduler task and its status
tasks = scheduler.get_tasks([sor.loading_task_id])
task = tasks[0] if tasks else None
task_status = _save_request_dict(sor, task)['save_task_status']
# create a new scheduler task only if the previous one has
# already been executed
if task_status == SAVE_TASK_FAILED or \
task_status == SAVE_TASK_SUCCEED:
can_create_task = True
sor = None
else:
can_create_task = False
if can_create_task:
# effectively create the scheduler task
task_dict = create_oneshot_task_dict(
_visit_type_task[visit_type], **kwargs)
task = scheduler.create_tasks([task_dict])[0]
# pending save request has been accepted
if sor:
sor.status = SAVE_REQUEST_ACCEPTED
sor.loading_task_id = task['id']
sor.save()
else:
sor = SaveOriginRequest.objects.create(visit_type=visit_type,
origin_url=origin_url,
status=save_request_status, # noqa
loading_task_id=task['id']) # noqa
# save request must be manually reviewed for acceptance
elif save_request_status == SAVE_REQUEST_PENDING:
# check if such a save request has already been submitted,
# no need to add it to the database in that case
try:
sor = SaveOriginRequest.objects.get(visit_type=visit_type,
origin_url=origin_url,
status=save_request_status)
# if not, add it to the database
except ObjectDoesNotExist:
sor = SaveOriginRequest.objects.create(visit_type=visit_type,
origin_url=origin_url,
status=save_request_status)
# origin cannot be saved as its url is blacklisted,
# log the request to the database anyway
else:
sor = SaveOriginRequest.objects.create(visit_type=visit_type,
origin_url=origin_url,
status=save_request_status)
if save_request_status == SAVE_REQUEST_REJECTED:
raise ForbiddenExc(('The "save code now" request has been rejected '
'because the provided origin url is blacklisted.'))
return _save_request_dict(sor, task)
def get_save_origin_requests_from_queryset(requests_queryset):
"""
Get all save requests from a SaveOriginRequest queryset.
Args:
requests_queryset (django.db.models.QuerySet): input
SaveOriginRequest queryset
Returns:
list: A list of save origin requests dict as described in
:func:`swh.web.common.origin_save.create_save_origin_request`
"""
task_ids = []
for sor in requests_queryset:
task_ids.append(sor.loading_task_id)
save_requests = []
if task_ids:
tasks = scheduler.get_tasks(task_ids)
tasks = {task['id']: task for task in tasks}
for sor in requests_queryset:
sr_dict = _save_request_dict(sor, tasks.get(sor.loading_task_id))
save_requests.append(sr_dict)
return save_requests
def get_save_origin_requests(visit_type, origin_url):
"""
Get all save requests for a given software origin.
Args:
visit_type (str): the type of visit
origin_url (str): the url of the origin
Raises:
BadInputExc: the visit type or origin url is invalid
NotFoundExc: no save requests can be found for the given origin
Returns:
list: A list of save origin requests dict as described in
:func:`swh.web.common.origin_save.create_save_origin_request`
"""
_check_visit_type_savable(visit_type)
_check_origin_url_valid(origin_url)
sors = SaveOriginRequest.objects.filter(visit_type=visit_type,
origin_url=origin_url)
if sors.count() == 0:
raise NotFoundExc(('No save requests found for visit of type '
'%s on origin with url %s.')
% (visit_type, origin_url))
return get_save_origin_requests_from_queryset(sors)
def get_save_origin_task_info(save_request_id):
"""
Get detailed information about an accepted save origin request
and its associated loading task.
If the associated loading task info is archived and removed
from the scheduler database, returns an empty dictionary.
Args:
save_request_id (int): identifier of a save origin request
Returns:
dict: A dictionary with the following keys:
- **type**: loading task type
- **arguments**: loading task arguments
- **id**: loading task database identifier
- **backend_id**: loading task celery identifier
- **scheduled**: loading task scheduling date
- **ended**: loading task termination date
- **status**: loading task execution status
Depending on the availability of the task logs in the Elasticsearch
cluster of Software Heritage, the returned dictionary may also
contain the following keys:
- **name**: associated celery task name
- **message**: relevant log message from task execution
- **duration**: task execution time (only if it succeeded)
- **worker**: name of the worker that executed the task
"""
try:
save_request = SaveOriginRequest.objects.get(id=save_request_id)
except ObjectDoesNotExist:
return {}
task = scheduler.get_tasks([save_request.loading_task_id])
task = task[0] if task else None
if task is None:
return {}
task_run = scheduler.get_task_runs([task['id']])
task_run = task_run[0] if task_run else None
if task_run is None:
return {}
task_run['type'] = task['type']
task_run['arguments'] = task['arguments']
task_run['id'] = task_run['task']
del task_run['task']
del task_run['metadata']
del task_run['started']
es_workers_index_url = config.get_config()['es_workers_index_url']
if not es_workers_index_url:
return task_run
es_workers_index_url += '/_search'
if save_request.visit_date:
min_ts = save_request.visit_date
max_ts = min_ts + timedelta(days=7)
else:
min_ts = save_request.request_date
max_ts = min_ts + timedelta(days=30)
min_ts = int(min_ts.timestamp()) * 1000
max_ts = int(max_ts.timestamp()) * 1000
save_task_status = _save_task_status[task['status']]
priority = '3' if save_task_status == SAVE_TASK_FAILED else '6'
query = {
'bool': {
'must': [
{
'match_phrase': {
'priority': {
'query': priority
}
}
},
{
'match_phrase': {
'swh_task_id': {
'query': task_run['backend_id']
}
}
},
{
'range': {
'@timestamp': {
'gte': min_ts,
'lte': max_ts,
'format': 'epoch_millis'
}
}
}
]
}
}
try:
response = requests.post(es_workers_index_url,
json={'query': query,
'sort': ['@timestamp']},
timeout=30)
results = json.loads(response.text)
if results['hits']['total'] >= 1:
task_run_info = results['hits']['hits'][-1]['_source']
if 'swh_logging_args_runtime' in task_run_info:
duration = task_run_info['swh_logging_args_runtime']
task_run['duration'] = duration
if 'message' in task_run_info:
task_run['message'] = task_run_info['message']
if 'swh_logging_args_name' in task_run_info:
task_run['name'] = task_run_info['swh_logging_args_name']
elif 'swh_task_name' in task_run_info:
task_run['name'] = task_run_info['swh_task_name']
if 'hostname' in task_run_info:
task_run['worker'] = task_run_info['hostname']
elif 'host' in task_run_info:
task_run['worker'] = task_run_info['host']
except Exception as exc:
logger.warning('Request to Elasticsearch failed\n%s', exc)
sentry_sdk.capture_exception(exc)
return task_run
+
+
+SUBMITTED_SAVE_REQUESTS_METRIC = 'swh_web_submitted_save_requests'
+
+_submitted_save_requests_gauge = Gauge(
+ name=SUBMITTED_SAVE_REQUESTS_METRIC,
+ documentation='Number of submitted origin save requests',
+ labelnames=['status', 'visit_type'],
+ registry=SWH_WEB_METRICS_REGISTRY)
+
+
+ACCEPTED_SAVE_REQUESTS_METRIC = 'swh_web_accepted_save_requests'
+
+_accepted_save_requests_gauge = Gauge(
+ name=ACCEPTED_SAVE_REQUESTS_METRIC,
+ documentation='Number of accepted origin save requests',
+ labelnames=['load_task_status', 'visit_type'],
+ registry=SWH_WEB_METRICS_REGISTRY)
+
+
+def compute_save_requests_metrics():
+ """Compute a couple of Prometheus metrics related to
+ origin save requests"""
+
+ request_statuses = (SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED,
+ SAVE_REQUEST_PENDING)
+
+ load_task_statuses = (SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED,
+ SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEED,
+ SAVE_TASK_FAILED, SAVE_TASK_RUNNING)
+
+ visit_types = get_savable_visit_types()
+
+ labels_set = product(request_statuses, visit_types)
+
+ for labels in labels_set:
+ _submitted_save_requests_gauge.labels(*labels).set(0)
+
+ labels_set = product(load_task_statuses, visit_types)
+
+ for labels in labels_set:
+ _accepted_save_requests_gauge.labels(*labels).set(0)
+
+ for sor in SaveOriginRequest.objects.all():
+ if sor.status == SAVE_REQUEST_ACCEPTED:
+ _accepted_save_requests_gauge.labels(
+ load_task_status=sor.loading_task_status,
+ visit_type=sor.visit_type).inc()
+
+ _submitted_save_requests_gauge.labels(
+ status=sor.status, visit_type=sor.visit_type).inc()
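
A note on the zero-initialization above: Prometheus treats a missing
time series differently from one whose value is zero, so pre-setting
every (status, visit_type) combination keeps the scrape output complete
even when no save request matches a given pair. A minimal standalone
sketch of the same pattern (the metric and label names here are
illustrative, not part of this patch):

    from itertools import product

    from prometheus_client import CollectorRegistry, Gauge
    from prometheus_client.exposition import generate_latest

    registry = CollectorRegistry(auto_describe=True)

    requests_gauge = Gauge(
        name='example_save_requests',
        documentation='Example save request counts',
        labelnames=['status', 'visit_type'],
        registry=registry)

    # zero-initialize every label combination so all series appear
    # in the scrape output, even those that were never incremented
    for labels in product(('accepted', 'rejected'), ('git', 'hg')):
        requests_gauge.labels(*labels).set(0)

    requests_gauge.labels(status='accepted', visit_type='git').inc()

    # prints text exposition lines such as:
    # example_save_requests{status="accepted",visit_type="git"} 1.0
    print(generate_latest(registry).decode())
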
diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
index b6094e37..d0b5ed64 100644
--- a/swh/web/common/utils.py
+++ b/swh/web/common/utils.py
@@ -1,338 +1,342 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import docutils.parsers.rst
import docutils.utils
import re
from datetime import datetime, timezone
from dateutil import parser as date_parser
from dateutil import tz
from django.urls import reverse as django_reverse
from django.http import QueryDict
+from prometheus_client.registry import CollectorRegistry
+
from rest_framework.authentication import SessionAuthentication
from swh.model.exceptions import ValidationError
from swh.model.identifiers import (
persistent_identifier, parse_persistent_identifier,
CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT
)
from swh.web.common.exc import BadInputExc
+SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True)
+
swh_object_icons = {
'branch': 'fa fa-code-fork',
'branches': 'fa fa-code-fork',
'content': 'fa fa-file-text',
'directory': 'fa fa-folder',
'person': 'fa fa-user',
'revisions history': 'fa fa-history',
'release': 'fa fa-tag',
'releases': 'fa fa-tag',
'revision': 'octicon-git-commit',
'snapshot': 'fa fa-camera',
'visits': 'fa fa-calendar',
}
def reverse(viewname, url_args=None, query_params=None,
current_app=None, urlconf=None):
"""An override of django reverse function supporting query parameters.
Args:
viewname (str): the name of the django view from which to compute a url
url_args (dict): dictionary of url arguments indexed by their names
query_params (dict): dictionary of query parameters to append to the
reversed url
current_app (str): the name of the django app tied to the view
urlconf (str): url configuration module
Returns:
str: the url of the requested view with processed arguments and
query parameters
"""
if url_args:
url_args = {k: v for k, v in url_args.items() if v is not None}
url = django_reverse(viewname, urlconf=urlconf, kwargs=url_args,
current_app=current_app)
if query_params:
query_params = {k: v for k, v in query_params.items() if v}
if query_params and len(query_params) > 0:
query_dict = QueryDict('', mutable=True)
for k in sorted(query_params.keys()):
query_dict[k] = query_params[k]
url += ('?' + query_dict.urlencode(safe='/;:'))
return url
def datetime_to_utc(date):
"""Returns datetime in UTC without timezone info
Args:
date (datetime.datetime): input datetime with timezone info
Returns:
datetime.datetime: datetime in UTC without timezone info
"""
if date.tzinfo:
return date.astimezone(tz.gettz('UTC')).replace(tzinfo=timezone.utc)
else:
return date
def parse_timestamp(timestamp):
"""Given a time or timestamp (as string), parse the result as UTC datetime.
Returns:
datetime.datetime: a timezone-aware datetime representing the
parsed value or None if the parsing fails.
Samples:
- 2016-01-12
- 2016-01-12T09:19:12+0100
- Today is January 1, 2047 at 8:21:00AM
- 1452591542
"""
if not timestamp:
return None
try:
date = date_parser.parse(timestamp, ignoretz=False, fuzzy=True)
return datetime_to_utc(date)
except Exception:
try:
return datetime.utcfromtimestamp(float(timestamp)).replace(
tzinfo=timezone.utc)
except (ValueError, OverflowError) as e:
raise BadInputExc(e)
def shorten_path(path):
"""Shorten the given path: for each hash present, only return the first
8 characters followed by an ellipsis"""
sha256_re = r'([0-9a-f]{8})[0-9a-z]{56}'
sha1_re = r'([0-9a-f]{8})[0-9a-f]{32}'
ret = re.sub(sha256_re, r'\1...', path)
return re.sub(sha1_re, r'\1...', ret)
def format_utc_iso_date(iso_date, fmt='%d %B %Y, %H:%M UTC'):
"""Turns a string representation of an ISO 8601 date string
to UTC and format it into a more human readable one.
For instance, from the following input
string: '2017-05-04T13:27:13+02:00' the following one
is returned: '04 May 2017, 11:27 UTC'.
A custom format string may also be provided
as a parameter.
Args:
iso_date (str): a string representation of an ISO 8601 date
fmt (str): optional date formatting string
Returns:
str: a formatted string representation of the input iso date
"""
if not iso_date:
return iso_date
date = parse_timestamp(iso_date)
return date.strftime(fmt)
def gen_path_info(path):
"""Function to generate path data navigation for use
with a breadcrumb in the swh web ui.
For instance, from a path /folder1/folder2/folder3,
it returns the following list::
[{'name': 'folder1', 'path': 'folder1'},
{'name': 'folder2', 'path': 'folder1/folder2'},
{'name': 'folder3', 'path': 'folder1/folder2/folder3'}]
Args:
path: a filesystem path
Returns:
list: a list of path data for navigation as illustrated above.
"""
path_info = []
if path:
sub_paths = path.strip('/').split('/')
path_from_root = ''
for p in sub_paths:
path_from_root += '/' + p
path_info.append({'name': p,
'path': path_from_root.strip('/')})
return path_info
def get_swh_persistent_id(object_type, object_id, scheme_version=1):
"""
Returns the persistent identifier for a swh object based on:
* the object type
* the object id
* the swh identifiers scheme version
Args:
object_type (str): the swh object type
(content/directory/release/revision/snapshot)
object_id (str): the swh object id (hexadecimal representation
of its hash value)
scheme_version (int): the scheme version of the swh
persistent identifiers
Returns:
str: the swh object persistent identifier
Raises:
BadInputExc: if the provided parameters do not allow
generating a valid identifier
"""
try:
swh_id = persistent_identifier(object_type, object_id, scheme_version)
except ValidationError as e:
raise BadInputExc('Invalid object (%s) for swh persistent id. %s' %
(object_id, e))
else:
return swh_id
def resolve_swh_persistent_id(swh_id, query_params=None):
"""
Try to resolve a Software Heritage persistent id into an url for
browsing the pointed object.
Args:
swh_id (str): a Software Heritage persistent identifier
query_params (django.http.QueryDict): optional dict filled with
query parameters to append to the browse url
Returns:
dict: a dict with the following keys:
* **swh_id_parsed (swh.model.identifiers.PersistentId)**:
the parsed identifier
* **browse_url (str)**: the url for browsing the pointed object
Raises:
BadInputExc: if the provided identifier cannot be parsed
"""
try:
swh_id_parsed = parse_persistent_identifier(swh_id)
object_type = swh_id_parsed.object_type
object_id = swh_id_parsed.object_id
browse_url = None
query_dict = QueryDict('', mutable=True)
if query_params and len(query_params) > 0:
for k in sorted(query_params.keys()):
query_dict[k] = query_params[k]
if 'origin' in swh_id_parsed.metadata:
query_dict['origin'] = swh_id_parsed.metadata['origin']
if object_type == CONTENT:
query_string = 'sha1_git:' + object_id
fragment = ''
if 'lines' in swh_id_parsed.metadata:
lines = swh_id_parsed.metadata['lines'].split('-')
fragment += '#L' + lines[0]
if len(lines) > 1:
fragment += '-L' + lines[1]
browse_url = reverse('browse-content',
url_args={'query_string': query_string},
query_params=query_dict) + fragment
elif object_type == DIRECTORY:
browse_url = reverse('browse-directory',
url_args={'sha1_git': object_id},
query_params=query_dict)
elif object_type == RELEASE:
browse_url = reverse('browse-release',
url_args={'sha1_git': object_id},
query_params=query_dict)
elif object_type == REVISION:
browse_url = reverse('browse-revision',
url_args={'sha1_git': object_id},
query_params=query_dict)
elif object_type == SNAPSHOT:
browse_url = reverse('browse-snapshot',
url_args={'snapshot_id': object_id},
query_params=query_dict)
except ValidationError as ve:
raise BadInputExc('Error when parsing identifier. %s' %
' '.join(ve.messages))
else:
return {'swh_id_parsed': swh_id_parsed,
'browse_url': browse_url}
def parse_rst(text, report_level=2):
"""
Parse a reStructuredText string with docutils.
Args:
text (str): string with reStructuredText markups in it
report_level (int): level of docutils report messages to print
(1 info 2 warning 3 error 4 severe 5 none)
Returns:
docutils.nodes.document: a parsed docutils document
"""
parser = docutils.parsers.rst.Parser()
components = (docutils.parsers.rst.Parser,)
settings = docutils.frontend.OptionParser(
components=components).get_default_values()
settings.report_level = report_level
document = docutils.utils.new_document('rst-doc', settings=settings)
parser.parse(text, document)
return document
def get_client_ip(request):
"""
Return the client IP address from an incoming HTTP request.
Args:
request (django.http.HttpRequest): the incoming HTTP request
Returns:
str: The client IP address
"""
x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
if x_forwarded_for:
ip = x_forwarded_for.split(',')[0]
else:
ip = request.META.get('REMOTE_ADDR')
return ip
def context_processor(request):
"""
Django context processor used to inject variables
in all swh-web templates.
"""
return {'swh_object_icons': swh_object_icons,
'available_languages': None}
class EnforceCSRFAuthentication(SessionAuthentication):
"""
Helper class to enforce CSRF validation on a DRF view
when a user is not authenticated.
"""
def authenticate(self, request):
user = getattr(request._request, 'user', None)
self.enforce_csrf(request)
return (user, None)
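
A side note on the SWH_WEB_METRICS_REGISTRY addition above: using a
dedicated CollectorRegistry rather than prometheus_client's global
default keeps the metrics endpoint limited to what swh-web explicitly
registers, since the default registry also carries process and platform
collectors. A quick illustrative check (not part of this patch):

    from prometheus_client import REGISTRY, CollectorRegistry
    from prometheus_client.exposition import generate_latest

    # the global default registry already exposes process/platform
    # collector samples, while a freshly created registry starts empty
    print(len(generate_latest(REGISTRY).splitlines()))             # > 0
    print(len(generate_latest(CollectorRegistry()).splitlines()))  # 0
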
diff --git a/swh/web/misc/metrics.py b/swh/web/misc/metrics.py
new file mode 100644
index 00000000..a7f5c7d1
--- /dev/null
+++ b/swh/web/misc/metrics.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from django.http import HttpResponse
+
+from prometheus_client.exposition import generate_latest, CONTENT_TYPE_LATEST
+from swh.web.common.origin_save import compute_save_requests_metrics
+from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY
+
+
+def prometheus_metrics(request):
+
+ compute_save_requests_metrics()
+
+ return HttpResponse(
+ content=generate_latest(registry=SWH_WEB_METRICS_REGISTRY),
+ content_type=CONTENT_TYPE_LATEST)
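
With the view above mapped to metrics/prometheus/ (see the urls.py hunk
below), the endpoint serves the standard Prometheus text exposition
format and can be scraped like any other target. A sketch of fetching
and parsing it by hand, assuming a local swh-web instance (the host and
port here are illustrative):

    import requests
    from prometheus_client.parser import text_string_to_metric_families

    # hypothetical local swh-web instance; adjust host/port to your setup
    resp = requests.get('http://localhost:5004/metrics/prometheus/')

    for family in text_string_to_metric_families(resp.text):
        for sample in family.samples:
            print(sample.name, sample.labels, sample.value)
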
diff --git a/swh/web/misc/urls.py b/swh/web/misc/urls.py
index d2e6112a..82deb6d6 100644
--- a/swh/web/misc/urls.py
+++ b/swh/web/misc/urls.py
@@ -1,80 +1,83 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import requests
import sentry_sdk
from django.conf.urls import url, include
from django.contrib.staticfiles import finders
from django.http import HttpResponse
from django.shortcuts import render
from swh.web.common import service
from swh.web.config import get_config
+from swh.web.misc.metrics import prometheus_metrics
def _jslicenses(request):
jslicenses_file = finders.find('jssources/jslicenses.json')
jslicenses_data = json.load(open(jslicenses_file))
jslicenses_data = sorted(jslicenses_data.items(),
key=lambda item: item[0].split('/')[-1])
return render(request, "misc/jslicenses.html",
{'jslicenses_data': jslicenses_data})
def _stat_counters(request):
stat = service.stat_counters()
url = get_config()['history_counters_url']
stat_counters_history = 'null'
if url:
try:
response = requests.get(url, timeout=5)
stat_counters_history = response.text
except Exception as exc:
sentry_sdk.capture_exception(exc)
json_data = '{"stat_counters": %s, "stat_counters_history": %s}' % (
json.dumps(stat), stat_counters_history)
return HttpResponse(json_data, content_type='application/json')
urlpatterns = [
url(r'^', include('swh.web.misc.coverage')),
url(r'^jslicenses/$', _jslicenses, name='jslicenses'),
url(r'^', include('swh.web.misc.origin_save')),
- url(r'^stat_counters', _stat_counters, name='stat-counters'),
+ url(r'^stat_counters/', _stat_counters, name='stat-counters'),
url(r'^', include('swh.web.misc.badges')),
+ url(r'^metrics/prometheus/$', prometheus_metrics,
+ name='metrics-prometheus'),
]
# when running end to end tests through cypress, declare some extra
# endpoints to provide input data for some of those tests
if get_config()['e2e_tests_mode']:
from swh.web.tests.data import (
get_content_code_data_by_ext,
get_content_other_data_by_ext,
get_content_code_data_all_exts,
get_content_code_data_by_filename,
get_content_code_data_all_filenames,
) # noqa
urlpatterns.append(
url(r'^tests/data/content/code/extension/(?P<ext>.+)/$',
get_content_code_data_by_ext,
name='tests-content-code-extension'))
urlpatterns.append(
url(r'^tests/data/content/other/extension/(?P<ext>.+)/$',
get_content_other_data_by_ext,
name='tests-content-other-extension'))
urlpatterns.append(url(r'^tests/data/content/code/extensions/$',
get_content_code_data_all_exts,
name='tests-content-code-extensions'))
urlpatterns.append(
url(r'^tests/data/content/code/filename/(?P<filename>.+)/$',
get_content_code_data_by_filename,
name='tests-content-code-filename'))
urlpatterns.append(url(r'^tests/data/content/code/filenames/$',
get_content_code_data_all_filenames,
name='tests-content-code-filenames'))
diff --git a/swh/web/tests/misc/test_metrics.py b/swh/web/tests/misc/test_metrics.py
new file mode 100644
index 00000000..8d0cd386
--- /dev/null
+++ b/swh/web/tests/misc/test_metrics.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from itertools import product
+import random
+
+from prometheus_client.exposition import CONTENT_TYPE_LATEST
+
+import pytest
+
+from swh.web.common.models import (
+ SaveOriginRequest,
+ SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING,
+ SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED,
+ SAVE_TASK_SUCCEED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING,
+ SAVE_TASK_NOT_CREATED
+)
+from swh.web.common.origin_save import (
+ get_savable_visit_types, ACCEPTED_SAVE_REQUESTS_METRIC,
+ SUBMITTED_SAVE_REQUESTS_METRIC
+)
+from swh.web.common.utils import reverse
+from swh.web.tests.django_asserts import assert_contains
+
+
+@pytest.mark.django_db
+def test_origin_save_metrics(client):
+ visit_types = get_savable_visit_types()
+ request_statuses = (SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED,
+ SAVE_REQUEST_PENDING)
+
+ load_task_statuses = (SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED,
+ SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEED,
+ SAVE_TASK_FAILED, SAVE_TASK_RUNNING)
+
+ for _ in range(random.randint(50, 100)):
+ visit_type = random.choice(visit_types)
+ request_status = random.choice(request_statuses)
+ load_task_status = random.choice(load_task_statuses)
+
+ SaveOriginRequest.objects.create(origin_url='origin',
+ visit_type=visit_type,
+ status=request_status,
+ loading_task_status=load_task_status)
+
+ url = reverse('metrics-prometheus')
+ resp = client.get(url)
+
+ assert resp.status_code == 200
+ assert resp['Content-Type'] == CONTENT_TYPE_LATEST
+
+ accepted_requests = SaveOriginRequest.objects.filter(
+ status=SAVE_REQUEST_ACCEPTED)
+
+ labels_set = product(visit_types, load_task_statuses)
+
+ for labels in labels_set:
+ sor_count = accepted_requests.filter(
+ visit_type=labels[0], loading_task_status=labels[1]).count()
+
+ metric_text = (f'{ACCEPTED_SAVE_REQUESTS_METRIC}{{'
+ f'load_task_status="{labels[1]}",'
+ f'visit_type="{labels[0]}"}} {float(sor_count)}\n')
+
+ assert_contains(resp, metric_text)
+
+ labels_set = product(visit_types, request_statuses)
+
+ for labels in labels_set:
+ sor_count = SaveOriginRequest.objects.filter(
+ visit_type=labels[0], status=labels[1]).count()
+
+ metric_text = (f'{SUBMITTED_SAVE_REQUESTS_METRIC}{{'
+ f'status="{labels[1]}",'
+ f'visit_type="{labels[0]}"}} {float(sor_count)}\n')
+
+ assert_contains(resp, metric_text)
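
The float(sor_count) in the expected strings above matches how
prometheus_client renders gauge samples: the text exposition format
always prints values as floats. A small illustrative check (the metric
name is made up):

    from prometheus_client import CollectorRegistry, Gauge
    from prometheus_client.exposition import generate_latest

    registry = CollectorRegistry()
    gauge = Gauge('example_total', 'example documentation', ['status'],
                  registry=registry)
    gauge.labels(status='accepted').inc(3)

    # renders the integer count as a float, e.g.:
    # example_total{status="accepted"} 3.0
    print(generate_latest(registry).decode())
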