diff --git a/swh/web/config.py b/swh/web/config.py
index 4f510091..b1cc8dbd 100644
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -1,237 +1,238 @@
 # Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 from typing import Any, Dict
 
 from swh.core import config
 from swh.counters import get_counters
 from swh.indexer.storage import get_indexer_storage
 from swh.scheduler import get_scheduler
 from swh.search import get_search
 from swh.storage import get_storage
 from swh.vault import get_vault
 from swh.web import settings
 
 SWH_WEB_SERVER_NAME = "archive.softwareheritage.org"
 SWH_WEB_INTERNAL_SERVER_NAME = "archive.internal.softwareheritage.org"
 SWH_WEB_STAGING_SERVER_NAMES = [
     "webapp.staging.swh.network",
     "webapp.internal.staging.swh.network",
 ]
 SETTINGS_DIR = os.path.dirname(settings.__file__)
 
 DEFAULT_CONFIG = {
     "allowed_hosts": ("list", []),
     "storage": (
         "dict",
         {
             "cls": "remote",
             "url": "http://127.0.0.1:5002/",
             "timeout": 10,
         },
     ),
     "indexer_storage": (
         "dict",
         {
             "cls": "remote",
             "url": "http://127.0.0.1:5007/",
             "timeout": 1,
         },
     ),
     "counters": (
         "dict",
         {
             "cls": "remote",
             "url": "http://127.0.0.1:5011/",
             "timeout": 1,
         },
     ),
     "search": (
         "dict",
         {
             "cls": "remote",
             "url": "http://127.0.0.1:5010/",
             "timeout": 10,
         },
     ),
     "search_config": (
         "dict",
         {
             "metadata_backend": "swh-indexer-storage",
         },  # or "swh-search"
     ),
     "log_dir": ("string", "/tmp/swh/log"),
     "debug": ("bool", False),
     "serve_assets": ("bool", False),
     "host": ("string", "127.0.0.1"),
     "port": ("int", 5004),
     "secret_key": ("string", "development key"),
     # do not display code highlighting for content > 1MB
     "content_display_max_size": ("int", 5 * 1024 * 1024),
     "snapshot_content_max_size": ("int", 1000),
     "throttling": (
         "dict",
         {
             "cache_uri": None,  # production: memcached as cache (127.0.0.1:11211)
             # development: in-memory cache so None
             "scopes": {
                 "swh_api": {
                     "limiter_rate": {"default": "120/h"},
                     "exempted_networks": ["127.0.0.0/8"],
                 },
                 "swh_api_origin_search": {
                     "limiter_rate": {"default": "10/m"},
                     "exempted_networks": ["127.0.0.0/8"],
                 },
                 "swh_vault_cooking": {
                     "limiter_rate": {"default": "120/h", "GET": "60/m"},
                     "exempted_networks": ["127.0.0.0/8"],
                 },
                 "swh_save_origin": {
                     "limiter_rate": {"default": "120/h", "POST": "10/h"},
                     "exempted_networks": ["127.0.0.0/8"],
                 },
                 "swh_api_origin_visit_latest": {
                     "limiter_rate": {"default": "700/m"},
                     "exempted_networks": ["127.0.0.0/8"],
                 },
             },
         },
     ),
     "vault": (
         "dict",
         {
             "cls": "remote",
             "args": {
                 "url": "http://127.0.0.1:5005/",
             },
         },
     ),
     "scheduler": ("dict", {"cls": "remote", "url": "http://127.0.0.1:5008/"}),
     "development_db": ("string", os.path.join(SETTINGS_DIR, "db.sqlite3")),
     "test_db": ("dict", {"name": "swh-web-test"}),
     "production_db": ("dict", {"name": "swh-web"}),
     "deposit": (
         "dict",
         {
             "private_api_url": "https://deposit.softwareheritage.org/1/private/",
             "private_api_user": "swhworker",
             "private_api_password": "some-password",
         },
     ),
     "e2e_tests_mode": ("bool", False),
     "es_workers_index_url": ("string", ""),
     "history_counters_url": (
         "string",
         (
             "http://counters1.internal.softwareheritage.org:5011"
             "/counters_history/history.json"
         ),
     ),
     "client_config": ("dict", {}),
     "keycloak": ("dict", {"server_url": "", "realm_name": ""}),
     "graph": (
         "dict",
         {
             "server_url": "http://graph.internal.softwareheritage.org:5009/graph/",
             "max_edges": {"staff": 0, "user": 100000, "anonymous": 1000},
         },
     ),
"status": ( "dict", { "server_url": "https://status.softwareheritage.org/", "json_path": "1.0/status/578e5eddcdc0cc7951000520", }, ), "counters_backend": ("string", "swh-storage"), # or "swh-counters" "staging_server_names": ("list", SWH_WEB_STAGING_SERVER_NAMES), "instance_name": ("str", "archive-test.softwareheritage.org"), "give": ("dict", {"public_key": "", "token": ""}), "features": ("dict", {"add_forge_now": True}), "add_forge_now": ("dict", {"email_address": "add-forge-now@example.com"}), "swh_extra_django_apps": ( "list", [ "swh.web.inbound_email", "swh.web.add_forge_now", "swh.web.mailmap", "swh.web.save_code_now", "swh.web.deposit", "swh.web.badges", "swh.web.archive_coverage", + "swh.web.metrics", ], ), } swhweb_config: Dict[str, Any] = {} def get_config(config_file="web/web"): """Read the configuration file `config_file`. If an environment variable SWH_CONFIG_FILENAME is defined, this takes precedence over the config_file parameter. In any case, update the app with parameters (secret_key, conf) and return the parsed configuration as a dict. If no configuration file is provided, return a default configuration. """ if not swhweb_config: config_filename = os.environ.get("SWH_CONFIG_FILENAME") if config_filename: config_file = config_filename cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, "log_dir") if swhweb_config.get("search"): swhweb_config["search"] = get_search(**swhweb_config["search"]) else: swhweb_config["search"] = None swhweb_config["storage"] = get_storage(**swhweb_config["storage"]) swhweb_config["vault"] = get_vault(**swhweb_config["vault"]) swhweb_config["indexer_storage"] = get_indexer_storage( **swhweb_config["indexer_storage"] ) swhweb_config["scheduler"] = get_scheduler(**swhweb_config["scheduler"]) swhweb_config["counters"] = get_counters(**swhweb_config["counters"]) return swhweb_config def search(): """Return the current application's search.""" return get_config()["search"] def storage(): """Return the current application's storage.""" return get_config()["storage"] def vault(): """Return the current application's vault.""" return get_config()["vault"] def indexer_storage(): """Return the current application's indexer storage.""" return get_config()["indexer_storage"] def scheduler(): """Return the current application's scheduler.""" return get_config()["scheduler"] def counters(): """Return the current application's counters.""" return get_config()["counters"] diff --git a/swh/web/metrics/__init__.py b/swh/web/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swh/web/metrics/prometheus.py b/swh/web/metrics/prometheus.py new file mode 100644 index 00000000..3a26667b --- /dev/null +++ b/swh/web/metrics/prometheus.py @@ -0,0 +1,124 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from itertools import product + +from prometheus_client import Gauge +from prometheus_client.registry import CollectorRegistry + +from swh.web.save_code_now.models import ( + SAVE_REQUEST_ACCEPTED, + SAVE_REQUEST_PENDING, + SAVE_REQUEST_REJECTED, + SAVE_TASK_FAILED, + SAVE_TASK_NOT_CREATED, + SAVE_TASK_NOT_YET_SCHEDULED, + SAVE_TASK_RUNNING, + SAVE_TASK_SCHEDULED, + SAVE_TASK_SUCCEEDED, + SaveOriginRequest, +) +from swh.web.save_code_now.origin_save import 
diff --git a/swh/web/metrics/__init__.py b/swh/web/metrics/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swh/web/metrics/prometheus.py b/swh/web/metrics/prometheus.py
new file mode 100644
index 00000000..3a26667b
--- /dev/null
+++ b/swh/web/metrics/prometheus.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from itertools import product
+
+from prometheus_client import Gauge
+from prometheus_client.registry import CollectorRegistry
+
+from swh.web.save_code_now.models import (
+    SAVE_REQUEST_ACCEPTED,
+    SAVE_REQUEST_PENDING,
+    SAVE_REQUEST_REJECTED,
+    SAVE_TASK_FAILED,
+    SAVE_TASK_NOT_CREATED,
+    SAVE_TASK_NOT_YET_SCHEDULED,
+    SAVE_TASK_RUNNING,
+    SAVE_TASK_SCHEDULED,
+    SAVE_TASK_SUCCEEDED,
+    SaveOriginRequest,
+)
+from swh.web.save_code_now.origin_save import get_savable_visit_types
+
+SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True)
+
+SUBMITTED_SAVE_REQUESTS_METRIC = "swh_web_submitted_save_requests"
+
+_submitted_save_requests_gauge = Gauge(
+    name=SUBMITTED_SAVE_REQUESTS_METRIC,
+    documentation="Number of submitted origin save requests",
+    labelnames=["status", "visit_type"],
+    registry=SWH_WEB_METRICS_REGISTRY,
+)
+
+
+ACCEPTED_SAVE_REQUESTS_METRIC = "swh_web_accepted_save_requests"
+
+_accepted_save_requests_gauge = Gauge(
+    name=ACCEPTED_SAVE_REQUESTS_METRIC,
+    documentation="Number of accepted origin save requests",
+    labelnames=["load_task_status", "visit_type"],
+    registry=SWH_WEB_METRICS_REGISTRY,
+)
+
+
+# Metric on the delay of save code now requests per status and visit_type. This is
+# the time difference between when a save request is submitted and when it got ingested.
+ACCEPTED_SAVE_REQUESTS_DELAY_METRIC = "swh_web_save_requests_delay_seconds"
+
+_accepted_save_requests_delay_gauge = Gauge(
+    name=ACCEPTED_SAVE_REQUESTS_DELAY_METRIC,
+    documentation="Save Requests Duration",
+    labelnames=["load_task_status", "visit_type"],
+    registry=SWH_WEB_METRICS_REGISTRY,
+)
+
+
+def compute_save_requests_metrics() -> None:
+    """Compute Prometheus metrics related to origin save requests:
+
+    - Number of submitted origin save requests
+    - Number of accepted origin save requests
+    - Save Code Now requests delay between request time and actual time of ingestion
+
+    """
+
+    request_statuses = (
+        SAVE_REQUEST_ACCEPTED,
+        SAVE_REQUEST_REJECTED,
+        SAVE_REQUEST_PENDING,
+    )
+
+    load_task_statuses = (
+        SAVE_TASK_NOT_CREATED,
+        SAVE_TASK_NOT_YET_SCHEDULED,
+        SAVE_TASK_SCHEDULED,
+        SAVE_TASK_SUCCEEDED,
+        SAVE_TASK_FAILED,
+        SAVE_TASK_RUNNING,
+    )
+
+    # for metrics, we want access to all visit types
+    visit_types = get_savable_visit_types(privileged_user=True)
+
+    labels_set = product(request_statuses, visit_types)
+
+    for labels in labels_set:
+        _submitted_save_requests_gauge.labels(*labels).set(0)
+
+    labels_set = product(load_task_statuses, visit_types)
+
+    for labels in labels_set:
+        _accepted_save_requests_gauge.labels(*labels).set(0)
+
+    duration_load_task_statuses = (
+        SAVE_TASK_FAILED,
+        SAVE_TASK_SUCCEEDED,
+    )
+
+    for labels in product(duration_load_task_statuses, visit_types):
+        _accepted_save_requests_delay_gauge.labels(*labels).set(0)
+
+    for sor in SaveOriginRequest.objects.all():
+        if sor.status == SAVE_REQUEST_ACCEPTED:
+            _accepted_save_requests_gauge.labels(
+                load_task_status=sor.loading_task_status,
+                visit_type=sor.visit_type,
+            ).inc()
+
+        _submitted_save_requests_gauge.labels(
+            status=sor.status, visit_type=sor.visit_type
+        ).inc()
+
+        if (
+            sor.loading_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED)
+            and sor.visit_date is not None
+            and sor.request_date is not None
+        ):
+            delay = sor.visit_date.timestamp() - sor.request_date.timestamp()
+            _accepted_save_requests_delay_gauge.labels(
+                load_task_status=sor.loading_task_status,
+                visit_type=sor.visit_type,
+            ).inc(delay)
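For readers unfamiliar with `prometheus_client`, here is a self-contained sketch of the gauge-with-labels pattern the module above relies on; the metric name and label values are invented, not the ones this module registers:

```python
from prometheus_client import Gauge
from prometheus_client.exposition import generate_latest
from prometheus_client.registry import CollectorRegistry

registry = CollectorRegistry(auto_describe=True)
demo_gauge = Gauge(
    name="demo_requests_delay_seconds",
    documentation="Cumulated delay of demo requests",
    labelnames=["status", "visit_type"],
    registry=registry,
)

# set(0) pre-creates a label combination so every scrape exposes it;
# inc(delay) then accumulates per (status, visit_type) pair, the same
# pattern compute_save_requests_metrics() follows above.
demo_gauge.labels("succeeded", "git").set(0)
demo_gauge.labels("succeeded", "git").inc(12.5)
print(generate_latest(registry).decode())
```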
[ + url(r"^metrics/prometheus/$", prometheus_metrics, name="metrics-prometheus"), +] diff --git a/swh/web/misc/metrics.py b/swh/web/metrics/views.py similarity index 74% rename from swh/web/misc/metrics.py rename to swh/web/metrics/views.py index 29acc9dc..f0509bd2 100644 --- a/swh/web/misc/metrics.py +++ b/swh/web/metrics/views.py @@ -1,21 +1,23 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from prometheus_client.exposition import CONTENT_TYPE_LATEST, generate_latest from django.http import HttpResponse -from swh.web.save_code_now.origin_save import compute_save_requests_metrics -from swh.web.utils import SWH_WEB_METRICS_REGISTRY +from swh.web.metrics.prometheus import ( + SWH_WEB_METRICS_REGISTRY, + compute_save_requests_metrics, +) def prometheus_metrics(request): compute_save_requests_metrics() return HttpResponse( content=generate_latest(registry=SWH_WEB_METRICS_REGISTRY), content_type=CONTENT_TYPE_LATEST, ) diff --git a/swh/web/misc/urls.py b/swh/web/misc/urls.py index 7c12bd6c..292627e8 100644 --- a/swh/web/misc/urls.py +++ b/swh/web/misc/urls.py @@ -1,117 +1,116 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json import requests from django.conf.urls import include from django.contrib.staticfiles import finders from django.http import JsonResponse from django.shortcuts import render from django.urls import re_path as url from django.views.decorators.clickjacking import xframe_options_exempt from swh.web.config import get_config -from swh.web.misc.metrics import prometheus_metrics from swh.web.utils import archive from swh.web.utils.exc import sentry_capture_exception def _jslicenses(request): jslicenses_file = finders.find("jssources/jslicenses.json") jslicenses_data = json.load(open(jslicenses_file)) jslicenses_data = sorted( jslicenses_data.items(), key=lambda item: item[0].split("/")[-1] ) return render(request, "misc/jslicenses.html", {"jslicenses_data": jslicenses_data}) def _stat_counters(request): stat_counters = archive.stat_counters() url = get_config()["history_counters_url"] stat_counters_history = {} try: response = requests.get(url, timeout=5) stat_counters_history = json.loads(response.text) except Exception as exc: sentry_capture_exception(exc) counters = { "stat_counters": stat_counters, "stat_counters_history": stat_counters_history, } return JsonResponse(counters) @xframe_options_exempt def hiring_banner(request): lang = request.GET.get("lang") return render( request, "misc/hiring-banner-iframe.html", { "lang": lang if lang else "en", }, ) urlpatterns = [ url(r"^jslicenses/$", _jslicenses, name="jslicenses"), url(r"^stat_counters/$", _stat_counters, name="stat-counters"), - url(r"^metrics/prometheus/$", prometheus_metrics, name="metrics-prometheus"), url(r"^", include("swh.web.misc.fundraising")), url(r"^hiring/banner/$", hiring_banner, name="swh-hiring-banner"), ] # when running end to end tests through cypress, declare some extra # endpoints to provide input data for some of those tests if get_config()["e2e_tests_mode"]: + from swh.web.tests.views import ( 
diff --git a/swh/web/misc/urls.py b/swh/web/misc/urls.py
index 7c12bd6c..292627e8 100644
--- a/swh/web/misc/urls.py
+++ b/swh/web/misc/urls.py
@@ -1,117 +1,116 @@
 # Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 
 import requests
 
 from django.conf.urls import include
 from django.contrib.staticfiles import finders
 from django.http import JsonResponse
 from django.shortcuts import render
 from django.urls import re_path as url
 from django.views.decorators.clickjacking import xframe_options_exempt
 
 from swh.web.config import get_config
-from swh.web.misc.metrics import prometheus_metrics
 from swh.web.utils import archive
 from swh.web.utils.exc import sentry_capture_exception
 
 
 def _jslicenses(request):
     jslicenses_file = finders.find("jssources/jslicenses.json")
     jslicenses_data = json.load(open(jslicenses_file))
     jslicenses_data = sorted(
         jslicenses_data.items(), key=lambda item: item[0].split("/")[-1]
     )
     return render(
         request, "misc/jslicenses.html", {"jslicenses_data": jslicenses_data}
     )
 
 
 def _stat_counters(request):
     stat_counters = archive.stat_counters()
     url = get_config()["history_counters_url"]
     stat_counters_history = {}
     try:
         response = requests.get(url, timeout=5)
         stat_counters_history = json.loads(response.text)
     except Exception as exc:
         sentry_capture_exception(exc)
 
     counters = {
         "stat_counters": stat_counters,
         "stat_counters_history": stat_counters_history,
     }
     return JsonResponse(counters)
 
 
 @xframe_options_exempt
 def hiring_banner(request):
     lang = request.GET.get("lang")
     return render(
         request,
         "misc/hiring-banner-iframe.html",
         {
             "lang": lang if lang else "en",
         },
     )
 
 
 urlpatterns = [
     url(r"^jslicenses/$", _jslicenses, name="jslicenses"),
     url(r"^stat_counters/$", _stat_counters, name="stat-counters"),
-    url(r"^metrics/prometheus/$", prometheus_metrics, name="metrics-prometheus"),
     url(r"^", include("swh.web.misc.fundraising")),
     url(r"^hiring/banner/$", hiring_banner, name="swh-hiring-banner"),
 ]
 
 
 # when running end to end tests through cypress, declare some extra
 # endpoints to provide input data for some of those tests
 if get_config()["e2e_tests_mode"]:
+    from swh.web.tests.views import (
         get_content_code_data_all_exts,
         get_content_code_data_all_filenames,
         get_content_code_data_by_ext,
         get_content_code_data_by_filename,
         get_content_other_data_by_ext,
     )
 
     urlpatterns.append(
         url(
             r"^tests/data/content/code/extension/(?P<ext>.+)/$",
             get_content_code_data_by_ext,
             name="tests-content-code-extension",
         )
     )
     urlpatterns.append(
         url(
             r"^tests/data/content/other/extension/(?P<ext>.+)/$",
             get_content_other_data_by_ext,
             name="tests-content-other-extension",
         )
     )
     urlpatterns.append(
         url(
             r"^tests/data/content/code/extensions/$",
             get_content_code_data_all_exts,
             name="tests-content-code-extensions",
         )
     )
     urlpatterns.append(
         url(
             r"^tests/data/content/code/filename/(?P<filename>.+)/$",
             get_content_code_data_by_filename,
             name="tests-content-code-filename",
         )
     )
     urlpatterns.append(
         url(
             r"^tests/data/content/code/filenames/$",
             get_content_code_data_all_filenames,
             name="tests-content-code-filenames",
         )
     )
diff --git a/swh/web/save_code_now/origin_save.py b/swh/web/save_code_now/origin_save.py
index b2883f03..a7b3bfe9 100644
--- a/swh/web/save_code_now/origin_save.py
+++ b/swh/web/save_code_now/origin_save.py
@@ -1,941 +1,838 @@
 # Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime, timedelta, timezone
 from functools import lru_cache
-from itertools import product
 import json
 import logging
 from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urlparse
 
-from prometheus_client import Gauge
 import requests
 
 from django.core.exceptions import ObjectDoesNotExist, ValidationError
 from django.core.validators import URLValidator
 from django.db.models import Q, QuerySet
 from django.utils.html import escape
 
 from swh.scheduler.utils import create_oneshot_task_dict
 from swh.web.config import get_config, scheduler
 from swh.web.save_code_now.models import (
     SAVE_REQUEST_ACCEPTED,
     SAVE_REQUEST_PENDING,
     SAVE_REQUEST_REJECTED,
     SAVE_TASK_FAILED,
-    SAVE_TASK_NOT_CREATED,
     SAVE_TASK_NOT_YET_SCHEDULED,
     SAVE_TASK_RUNNING,
     SAVE_TASK_SCHEDULED,
     SAVE_TASK_SUCCEEDED,
     VISIT_STATUS_CREATED,
     VISIT_STATUS_ONGOING,
     SaveAuthorizedOrigin,
     SaveOriginRequest,
     SaveUnauthorizedOrigin,
 )
-from swh.web.utils import SWH_WEB_METRICS_REGISTRY, archive, parse_iso8601_date_to_utc
+from swh.web.utils import archive, parse_iso8601_date_to_utc
 from swh.web.utils.exc import (
     BadInputExc,
     ForbiddenExc,
     NotFoundExc,
     sentry_capture_exception,
 )
 from swh.web.utils.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo
 
 logger = logging.getLogger(__name__)
 
 # Number of days in the past to lookup for information
 MAX_THRESHOLD_DAYS = 30
 
 # Non terminal visit statuses which needs updates
 NON_TERMINAL_STATUSES = [
     VISIT_STATUS_CREATED,
     VISIT_STATUS_ONGOING,
 ]
 
 
 def get_origin_save_authorized_urls() -> List[str]:
     """
     Get the list of origin url prefixes authorized to be
     immediately loaded into the archive (whitelist).
 
     Returns:
         list: The list of authorized origin url prefix
     """
     return [origin.url for origin in SaveAuthorizedOrigin.objects.all()]
 
 
 def get_origin_save_unauthorized_urls() -> List[str]:
     """
     Get the list of origin url prefixes forbidden to be
     loaded into the archive (blacklist).
 
     Returns:
         list: the list of unauthorized origin url prefix
     """
     return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()]
 
 
 def can_save_origin(origin_url: str, bypass_pending_review: bool = False) -> str:
     """
     Check if a software origin can be saved into the archive.
 
     Based on the origin url, the save request will be either:
 
       * immediately accepted if the url is whitelisted
       * rejected if the url is blacklisted
       * put in pending state for manual review otherwise
 
     Args:
         origin_url (str): the software origin url to check
 
     Returns:
         str: the origin save request status, either **accepted**,
         **rejected** or **pending**
     """
     # origin url may be blacklisted
     for url_prefix in get_origin_save_unauthorized_urls():
         if origin_url.startswith(url_prefix):
             return SAVE_REQUEST_REJECTED
 
     # if the origin url is in the white list, it can be immediately saved
     for url_prefix in get_origin_save_authorized_urls():
         if origin_url.startswith(url_prefix):
             return SAVE_REQUEST_ACCEPTED
 
     # otherwise, the origin url needs to be manually verified if the user
     # that submitted it does not have special permission
     if bypass_pending_review:
         # mark the origin URL as trusted in that case
         SaveAuthorizedOrigin.objects.get_or_create(url=origin_url)
         return SAVE_REQUEST_ACCEPTED
     else:
         return SAVE_REQUEST_PENDING
 
 
 # map visit type to scheduler task
 # TODO: do not hardcode the task name here (T1157)
 _visit_type_task = {
     "git": "load-git",
     "hg": "load-hg",
     "svn": "load-svn",
     "cvs": "load-cvs",
     "bzr": "load-bzr",
 }
 
 _visit_type_task_privileged = {
     "archives": "load-archive-files",
 }
 
 # map scheduler task status to origin save status
 _save_task_status = {
     "next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED,
     "next_run_scheduled": SAVE_TASK_SCHEDULED,
     "completed": SAVE_TASK_SUCCEEDED,
     "disabled": SAVE_TASK_FAILED,
 }
 
 # map scheduler task_run status to origin save status
 _save_task_run_status = {
     "scheduled": SAVE_TASK_SCHEDULED,
     "started": SAVE_TASK_RUNNING,
     "eventful": SAVE_TASK_SUCCEEDED,
     "uneventful": SAVE_TASK_SUCCEEDED,
     "failed": SAVE_TASK_FAILED,
     "permfailed": SAVE_TASK_FAILED,
     "lost": SAVE_TASK_FAILED,
 }
 
 
 @lru_cache()
 def get_scheduler_load_task_types() -> List[str]:
     task_types = scheduler().get_task_types()
     return [t["type"] for t in task_types if t["type"].startswith("load")]
 
 
 def get_savable_visit_types_dict(privileged_user: bool = False) -> Dict:
     """Returned the supported task types the user has access to.
 
     Args:
         privileged_user: Flag to determine if all visit types should be returned
             or not. Default to False to only list unprivileged visit types.
 
     Returns:
         the dict of supported visit types for the user
     """
     if privileged_user:
         task_types = {**_visit_type_task, **_visit_type_task_privileged}
     else:
         task_types = _visit_type_task
     # filter visit types according to scheduler load task types if available
     try:
         load_task_types = get_scheduler_load_task_types()
         return {k: v for k, v in task_types.items() if v in load_task_types}
     except Exception:
         return task_types
 
 
 def get_savable_visit_types(privileged_user: bool = False) -> List[str]:
     """Return the list of visit types the user can perform save requests on.
 
     Args:
         privileged_user: Flag to determine if all visit types should be returned
             or not. Default to False to only list unprivileged visit types.
 
     Returns:
         the list of saveable visit types
     """
     return sorted(list(get_savable_visit_types_dict(privileged_user).keys()))
 
 
 def _check_visit_type_savable(visit_type: str, privileged_user: bool = False) -> None:
     visit_type_tasks = get_savable_visit_types(privileged_user)
     if visit_type not in visit_type_tasks:
         allowed_visit_types = ", ".join(visit_type_tasks)
         raise BadInputExc(
             f"Visit of type {visit_type} can not be saved! "
             f"Allowed types are the following: {allowed_visit_types}"
         )
 
 
 _validate_url = URLValidator(
     schemes=["http", "https", "svn", "git", "rsync", "pserver", "ssh", "bzr"]
 )
 
 
 def _check_origin_url_valid(origin_url: str) -> None:
     try:
         _validate_url(origin_url)
     except ValidationError:
         raise BadInputExc(
             f"The provided origin url ({escape(origin_url)}) is not valid!"
         )
 
     parsed_url = urlparse(origin_url)
     if parsed_url.password not in (None, "", "anonymous"):
         raise BadInputExc(
             "The provided origin url contains a password and cannot be "
             "accepted for security reasons."
         )
 
 
 def origin_exists(origin_url: str) -> OriginExistenceCheckInfo:
     """Check the origin url for existence. If it exists, extract some more useful
     information on the origin.
 
     """
     resp = requests.head(origin_url, allow_redirects=True)
     exists = resp.ok
     content_length: Optional[int] = None
     last_modified: Optional[str] = None
     if exists:
         # Also process X-Archive-Orig-* headers in case the URL targets the
         # Internet Archive.
         size_ = resp.headers.get(
             "Content-Length", resp.headers.get("X-Archive-Orig-Content-Length")
         )
         content_length = int(size_) if size_ else None
         try:
             date_str = resp.headers.get(
                 "Last-Modified", resp.headers.get("X-Archive-Orig-Last-Modified", "")
             )
             date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z")
             last_modified = date.isoformat()
         except ValueError:
             # if not provided or not parsable as per the expected format, keep it None
             pass
 
     return OriginExistenceCheckInfo(
         origin_url=origin_url,
         exists=exists,
         last_modified=last_modified,
         content_length=content_length,
     )
 
 
 def _check_origin_exists(url: str) -> OriginExistenceCheckInfo:
     """Ensure an URL exists, if not raise an explicit message."""
     metadata = origin_exists(url)
     if not metadata["exists"]:
         raise BadInputExc(f"The provided url ({escape(url)}) does not exist!")
 
     return metadata
 
 
 def _get_visit_info_for_save_request(
     save_request: SaveOriginRequest,
 ) -> Tuple[Optional[datetime], Optional[str]]:
     """Retrieve visit information out of a save request
 
     Args:
         save_request: Input save origin request to retrieve information for.
 
     Returns:
         Tuple of (visit date, optional visit status) for such save request origin
     """
     visit_date = None
     visit_status = None
     time_now = datetime.now(tz=timezone.utc)
     time_delta = time_now - save_request.request_date
 
     # stop trying to find a visit date one month after save request submission
     # as those requests to storage are expensive and associated loading task
     # surely ended up with errors
     if time_delta.days <= MAX_THRESHOLD_DAYS:
         origin = save_request.origin_url
         ovs = archive.origin_visit_find_by_date(origin, save_request.request_date)
         if ovs:
             visit_date = parse_iso8601_date_to_utc(ovs["date"])
             visit_status = ovs["status"]
 
     return visit_date, visit_status
 
 
 def _check_visit_update_status(
     save_request: SaveOriginRequest,
 ) -> Tuple[Optional[datetime], Optional[str], Optional[str]]:
     """Given a save request, determine whether a save request was successful or failed.
 
     Args:
         save_request: Input save origin request to retrieve information for.
 
     Returns:
         Tuple of (optional visit date, optional visit status, optional save
         task status) for such save request origin
     """
     visit_date, visit_status = _get_visit_info_for_save_request(save_request)
     loading_task_status = None
     if visit_date and visit_status in ("full", "partial"):
         # visit has been performed, mark the saving task as succeeded
         loading_task_status = SAVE_TASK_SUCCEEDED
     elif visit_status in ("created", "ongoing"):
         # visit is currently running
         loading_task_status = SAVE_TASK_RUNNING
     elif visit_status in ("not_found", "failed"):
         loading_task_status = SAVE_TASK_FAILED
     else:
         time_now = datetime.now(tz=timezone.utc)
         time_delta = time_now - save_request.request_date
         # consider the task as failed if it is still in scheduled state
         # 30 days after its submission
         if time_delta.days > MAX_THRESHOLD_DAYS:
             loading_task_status = SAVE_TASK_FAILED
     return visit_date, visit_status, loading_task_status
 
 
 def _compute_task_loading_status(
     task: Optional[Dict[str, Any]] = None,
     task_run: Optional[Dict[str, Any]] = None,
 ) -> Optional[str]:
     loading_task_status: Optional[str] = None
     # First determine the loading task status out of task information
     if task:
         loading_task_status = _save_task_status[task["status"]]
         if task_run:
             loading_task_status = _save_task_run_status[task_run["status"]]
 
     return loading_task_status
 
 
 def _update_save_request_info(
     save_request: SaveOriginRequest,
     task: Optional[Dict[str, Any]] = None,
     task_run: Optional[Dict[str, Any]] = None,
 ) -> SaveOriginRequestInfo:
     """Update save request information out of the visit status and fallback to the
     task and task_run information if the visit status is missing.
 
     Args:
         save_request: Save request
         task: Associated scheduler task information about the save request
         task_run: Most recent run occurrence of the associated task
 
     Returns:
         Summary of the save request information updated.
     """
     must_save = False
 
     # To determine the save code now request's final status, the visit date must be set
     # and the visit status must be a final one. Once they do, the save code now is
     # definitely done.
     if (
         not save_request.visit_date
         or not save_request.visit_status
         or save_request.visit_status in NON_TERMINAL_STATUSES
     ):
         visit_date, visit_status, loading_task_status = _check_visit_update_status(
             save_request
         )
         if not loading_task_status:  # fallback when not provided
             loading_task_status = _compute_task_loading_status(task, task_run)
 
         if visit_date != save_request.visit_date:
             must_save = True
             save_request.visit_date = visit_date
 
         if visit_status != save_request.visit_status:
             must_save = True
             save_request.visit_status = visit_status
 
         if (
             loading_task_status is not None
             and loading_task_status != save_request.loading_task_status
         ):
             must_save = True
             save_request.loading_task_status = loading_task_status
 
         if must_save:
             save_request.save()
 
     return save_request.to_dict()
 
 
 def create_save_origin_request(
     visit_type: str,
     origin_url: str,
     privileged_user: bool = False,
     user_id: Optional[int] = None,
     **kwargs,
 ) -> SaveOriginRequestInfo:
     """Create a loading task to save a software origin into the archive.
 
     This function aims to create a software origin loading task through the use of the
     swh-scheduler component.
 
     First, some checks are performed to see if the visit type and origin url are
     valid but also if the the save request can be accepted. For the 'archives' visit
     type, this also ensures the artifacts actually exists. If those checks passed, the
     loading task is then created. Otherwise, the save request is put in pending or
     rejected state.
 
     All the submitted save requests are logged into the swh-web database to keep track
     of them.
 
     Args:
         visit_type: the type of visit to perform (e.g. git, hg, svn, archives, ...)
         origin_url: the url of the origin to save
         privileged: Whether the user has some more privilege than other (bypass
             review, access to privileged other visit types)
         user_id: User identifier (provided when authenticated)
         kwargs: Optional parameters (e.g. artifact_url, artifact_filename,
             artifact_version)
 
     Raises:
         BadInputExc: the visit type or origin url is invalid or inexistent
         ForbiddenExc: the provided origin url is blacklisted
 
     Returns:
         dict: A dict describing the save request with the following keys:
 
             * **visit_type**: the type of visit to perform
             * **origin_url**: the url of the origin
             * **save_request_date**: the date the request was submitted
             * **save_request_status**: the request status, either **accepted**,
               **rejected** or **pending**
             * **save_task_status**: the origin loading task status, either
               **not created**, **not yet scheduled**, **scheduled**,
               **succeed** or **failed**
 
     """
     visit_type_tasks = get_savable_visit_types_dict(privileged_user)
     _check_visit_type_savable(visit_type, privileged_user)
     _check_origin_url_valid(origin_url)
     # if all checks passed so far, we can try and save the origin
     save_request_status = can_save_origin(origin_url, privileged_user)
     task = None
 
     # if the origin save request is accepted, create a scheduler
     # task to load it into the archive
     if save_request_status == SAVE_REQUEST_ACCEPTED:
         # create a task with high priority
         task_kwargs: Dict[str, Any] = {
             "priority": "high",
             "url": origin_url,
         }
         if visit_type == "archives":
             # extra arguments for that type are required
             archives_data = kwargs.get("archives_data", [])
             if not archives_data:
                 raise BadInputExc(
                     "Artifacts data are missing for the archives visit type."
                 )
             artifacts = []
             for artifact in archives_data:
                 artifact_url = artifact.get("artifact_url")
                 artifact_version = artifact.get("artifact_version")
                 if not artifact_url or not artifact_version:
                     raise BadInputExc("Missing url or version for an artifact to load.")
                 metadata = _check_origin_exists(artifact_url)
                 artifacts.append(
                     {
                         "url": artifact_url,
                         "version": artifact_version,
                         "time": metadata["last_modified"],
                         "length": metadata["content_length"],
                     }
                 )
             task_kwargs = dict(**task_kwargs, artifacts=artifacts, snapshot_append=True)
         sor = None
         # get list of previously submitted save requests (most recent first)
         current_sors = list(
             SaveOriginRequest.objects.filter(
                 visit_type=visit_type, origin_url=origin_url
             ).order_by("-request_date")
         )
 
         can_create_task = False
         # if no save requests previously submitted, create the scheduler task
         if not current_sors:
             can_create_task = True
         else:
             # get the latest submitted save request
             sor = current_sors[0]
             # if it was in pending state, we need to create the scheduler task
             # and update the save request info in the database
             if sor.status == SAVE_REQUEST_PENDING:
                 can_create_task = True
             # a task has already been created to load the origin
             elif sor.loading_task_id != -1:
                 # get the scheduler task and its status
                 tasks = scheduler().get_tasks([sor.loading_task_id])
                 task = tasks[0] if tasks else None
                 task_runs = scheduler().get_task_runs([sor.loading_task_id])
                 task_run = task_runs[0] if task_runs else None
                 save_request_info = _update_save_request_info(sor, task, task_run)
                 task_status = save_request_info["save_task_status"]
                 # create a new scheduler task only if the previous one has been
                 # already executed
                 if (
                     task_status == SAVE_TASK_FAILED
                     or task_status == SAVE_TASK_SUCCEEDED
                 ):
                     can_create_task = True
                     sor = None
                 else:
                     can_create_task = False
 
         if can_create_task:
             # effectively create the scheduler task
             task_dict = create_oneshot_task_dict(
                 visit_type_tasks[visit_type], **task_kwargs
             )
             task = scheduler().create_tasks([task_dict])[0]
 
             # pending save request has been accepted
             if sor:
                 sor.status = SAVE_REQUEST_ACCEPTED
                 sor.loading_task_id = task["id"]
                 sor.save()
             else:
                 sor = SaveOriginRequest.objects.create(
                     visit_type=visit_type,
                     origin_url=origin_url,
                     status=save_request_status,
                     loading_task_id=task["id"],
                     user_ids=f'"{user_id}"' if user_id else None,
                 )
 
     # save request must be manually reviewed for acceptation
     elif save_request_status == SAVE_REQUEST_PENDING:
         # check if there is already such a save request already submitted,
         # no need to add it to the database in that case
         try:
             sor = SaveOriginRequest.objects.get(
                 visit_type=visit_type, origin_url=origin_url, status=save_request_status
             )
             user_ids = sor.user_ids if sor.user_ids is not None else ""
             if user_id is not None and f'"{user_id}"' not in user_ids:
                 # update user ids list
                 sor.user_ids = f'{sor.user_ids},"{user_id}"'
                 sor.save()
         # if not add it to the database
         except ObjectDoesNotExist:
             sor = SaveOriginRequest.objects.create(
                 visit_type=visit_type,
                 origin_url=origin_url,
                 status=save_request_status,
                 user_ids=f'"{user_id}"' if user_id else None,
             )
 
     # origin can not be saved as its url is blacklisted,
     # log the request to the database anyway
     else:
         sor = SaveOriginRequest.objects.create(
             visit_type=visit_type,
             origin_url=origin_url,
             status=save_request_status,
             user_ids=f'"{user_id}"' if user_id else None,
         )
 
     if save_request_status == SAVE_REQUEST_REJECTED:
         raise ForbiddenExc(
             (
                 'The "save code now" request has been rejected '
                 "because the provided origin url is blacklisted."
             )
         )
 
     assert sor is not None
     return _update_save_request_info(sor, task)
 
 
 def update_save_origin_requests_from_queryset(
     requests_queryset: QuerySet,
 ) -> List[SaveOriginRequestInfo]:
     """Update all save requests from a SaveOriginRequest queryset, update their status
     in db and return the list of impacted save_requests.
 
     Args:
         requests_queryset: input SaveOriginRequest queryset
 
     Returns:
         list: A list of save origin request info dicts as described in
         :func:`swh.web.save_code_now.origin_save.create_save_origin_request`
     """
     task_ids = []
     for sor in requests_queryset:
         task_ids.append(sor.loading_task_id)
     save_requests = []
     if task_ids:
         try:
             tasks = scheduler().get_tasks(task_ids)
             tasks = {task["id"]: task for task in tasks}
             task_runs = scheduler().get_task_runs(tasks)
             task_runs = {task_run["task"]: task_run for task_run in task_runs}
         except Exception:
             # allow to avoid mocking api GET responses for /origin/save endpoint when
             # running cypress tests as scheduler is not available
             tasks = {}
             task_runs = {}
         for sor in requests_queryset:
             sr_dict = _update_save_request_info(
                 sor,
                 tasks.get(sor.loading_task_id),
                 task_runs.get(sor.loading_task_id),
             )
             save_requests.append(sr_dict)
     return save_requests
 
 
 def refresh_save_origin_request_statuses() -> List[SaveOriginRequestInfo]:
     """Refresh non-terminal save origin requests (SOR) in the backend.
 
     Non-terminal SOR are requests whose status is **accepted** and their task status
     are either **created**, **not yet scheduled**, **scheduled** or **running**.
 
     This shall compute this list of SOR, checks their status in the scheduler and
     optionally elasticsearch for their current status. Then update those in db.
 
     Finally, this returns the refreshed information on those SOR.
     """
     pivot_date = datetime.now(tz=timezone.utc) - timedelta(days=MAX_THRESHOLD_DAYS)
     save_requests = SaveOriginRequest.objects.filter(
         # Retrieve accepted request statuses (all statuses)
         Q(status=SAVE_REQUEST_ACCEPTED),
         # those without the required information we need to update
         Q(visit_date__isnull=True)
         | Q(visit_status__isnull=True)
         | Q(visit_status__in=NON_TERMINAL_STATUSES),
         # limit results to recent ones (that is roughly 30 days old at best)
         Q(request_date__gte=pivot_date),
     )
 
     return (
         update_save_origin_requests_from_queryset(save_requests)
         if save_requests.count() > 0
         else []
     )
 
 
 def get_save_origin_requests(
     visit_type: str, origin_url: str
 ) -> List[SaveOriginRequestInfo]:
     """
     Get all save requests for a given software origin.
 
     Args:
         visit_type: the type of visit
         origin_url: the url of the origin
 
     Raises:
         BadInputExc: the visit type or origin url is invalid
         swh.web.utils.exc.NotFoundExc: no save requests can be found for the
             given origin
 
     Returns:
         list: A list of save origin requests dict as described in
         :func:`swh.web.save_code_now.origin_save.create_save_origin_request`
     """
     _check_visit_type_savable(visit_type)
     _check_origin_url_valid(origin_url)
     sors = SaveOriginRequest.objects.filter(
         visit_type=visit_type, origin_url=origin_url
     )
     if sors.count() == 0:
         raise NotFoundExc(
             f"No save requests found for visit of type {visit_type} "
             f"on origin with url {origin_url}."
         )
     return update_save_origin_requests_from_queryset(sors)
 
 
 def get_save_origin_task_info(
     save_request_id: int, full_info: bool = True
 ) -> Dict[str, Any]:
     """
     Get detailed information about an accepted save origin request
     and its associated loading task.
 
     If the associated loading task info is archived and removed
     from the scheduler database, returns an empty dictionary.
 
     Args:
         save_request_id: identifier of a save origin request
         full_info: whether to return detailed info for staff users
 
     Returns:
         A dictionary with the following keys:
 
             - **type**: loading task type
             - **arguments**: loading task arguments
             - **id**: loading task database identifier
             - **backend_id**: loading task celery identifier
             - **scheduled**: loading task scheduling date
             - **ended**: loading task termination date
             - **status**: loading task execution status
             - **visit_status**: Actual visit status
 
         Depending on the availability of the task logs in the elasticsearch
         cluster of Software Heritage, the returned dictionary may also
         contain the following keys:
 
             - **name**: associated celery task name
             - **message**: relevant log message from task execution
             - **duration**: task execution time (only if it succeeded)
             - **worker**: name of the worker that executed the task
     """
     try:
         save_request = SaveOriginRequest.objects.get(id=save_request_id)
     except ObjectDoesNotExist:
         return {}
 
     task_info: Dict[str, Any] = {}
     if save_request.note is not None:
         task_info["note"] = save_request.note
 
     try:
         task = scheduler().get_tasks([save_request.loading_task_id])
     except Exception:
         # to avoid mocking GET responses of /save/task/info/ endpoint when running
         # cypress tests as scheduler is not available in that case
         task = None
 
     task = task[0] if task else None
     if task is None:
         return task_info
 
     task_run = scheduler().get_task_runs([task["id"]])
     task_run = task_run[0] if task_run else None
     if task_run is None:
         return task_info
     task_info.update(task_run)
     task_info["type"] = task["type"]
     task_info["arguments"] = task["arguments"]
     task_info["id"] = task_run["task"]
     del task_info["task"]
     del task_info["metadata"]
     # Enrich the task info with the loading visit status
     task_info["visit_status"] = save_request.visit_status
 
     es_workers_index_url = get_config()["es_workers_index_url"]
     if not es_workers_index_url:
         return task_info
     es_workers_index_url += "/_search"
 
     if save_request.visit_date:
         min_ts = save_request.visit_date
         max_ts = min_ts + timedelta(days=7)
     else:
         min_ts = save_request.request_date
         max_ts = min_ts + timedelta(days=MAX_THRESHOLD_DAYS)
     min_ts_unix = int(min_ts.timestamp()) * 1000
     max_ts_unix = int(max_ts.timestamp()) * 1000
 
     save_task_status = _save_task_status[task["status"]]
     priority = "3" if save_task_status == SAVE_TASK_FAILED else "6"
 
     query = {
         "bool": {
             "must": [
                 {"match_phrase": {"syslog.priority": {"query": priority}}},
                 {
                     "match_phrase": {
                         "journald.custom.swh_task_id": {"query": task_run["backend_id"]}
                     }
                 },
                 {
                     "range": {
                         "@timestamp": {
                             "gte": min_ts_unix,
                             "lte": max_ts_unix,
                             "format": "epoch_millis",
                         }
                     }
                 },
             ]
         }
     }
 
     try:
         response = requests.post(
             es_workers_index_url,
             json={"query": query, "sort": ["@timestamp"]},
             timeout=30,
         )
         results = json.loads(response.text)
         if results["hits"]["total"]["value"] >= 1:
             task_run_info = results["hits"]["hits"][-1]["_source"]
             journald_custom = task_run_info.get("journald", {}).get("custom", {})
             task_info["duration"] = journald_custom.get(
                 "swh_logging_args_runtime", "not available"
             )
             task_info["message"] = task_run_info.get("message", "not available")
             task_info["name"] = journald_custom.get("swh_task_name", "not available")
             task_info["worker"] = task_run_info.get("host", {}).get("hostname")
     except Exception as exc:
         logger.warning("Request to Elasticsearch failed\n%s", exc)
         sentry_capture_exception(exc)
 
     if not full_info:
         for field in ("id", "backend_id", "worker"):
             # remove some staff only fields
             task_info.pop(field, None)
         if "message" in task_run and "Loading failure" in task_run["message"]:
task_run["message"]: # hide traceback for non staff users, only display exception message_lines = task_info["message"].split("\n") message = "" for line in message_lines: if line.startswith("Traceback"): break message += f"{line}\n" message += message_lines[-1] task_info["message"] = message return task_info - - -SUBMITTED_SAVE_REQUESTS_METRIC = "swh_web_submitted_save_requests" - -_submitted_save_requests_gauge = Gauge( - name=SUBMITTED_SAVE_REQUESTS_METRIC, - documentation="Number of submitted origin save requests", - labelnames=["status", "visit_type"], - registry=SWH_WEB_METRICS_REGISTRY, -) - - -ACCEPTED_SAVE_REQUESTS_METRIC = "swh_web_accepted_save_requests" - -_accepted_save_requests_gauge = Gauge( - name=ACCEPTED_SAVE_REQUESTS_METRIC, - documentation="Number of accepted origin save requests", - labelnames=["load_task_status", "visit_type"], - registry=SWH_WEB_METRICS_REGISTRY, -) - - -# Metric on the delay of save code now request per status and visit_type. This is the -# time difference between the save code now is requested and the time it got ingested. -ACCEPTED_SAVE_REQUESTS_DELAY_METRIC = "swh_web_save_requests_delay_seconds" - -_accepted_save_requests_delay_gauge = Gauge( - name=ACCEPTED_SAVE_REQUESTS_DELAY_METRIC, - documentation="Save Requests Duration", - labelnames=["load_task_status", "visit_type"], - registry=SWH_WEB_METRICS_REGISTRY, -) - - -def compute_save_requests_metrics() -> None: - """Compute Prometheus metrics related to origin save requests: - - - Number of submitted origin save requests - - Number of accepted origin save requests - - Save Code Now requests delay between request time and actual time of ingestion - - """ - - request_statuses = ( - SAVE_REQUEST_ACCEPTED, - SAVE_REQUEST_REJECTED, - SAVE_REQUEST_PENDING, - ) - - load_task_statuses = ( - SAVE_TASK_NOT_CREATED, - SAVE_TASK_NOT_YET_SCHEDULED, - SAVE_TASK_SCHEDULED, - SAVE_TASK_SUCCEEDED, - SAVE_TASK_FAILED, - SAVE_TASK_RUNNING, - ) - - # for metrics, we want access to all visit types - visit_types = get_savable_visit_types(privileged_user=True) - - labels_set = product(request_statuses, visit_types) - - for labels in labels_set: - _submitted_save_requests_gauge.labels(*labels).set(0) - - labels_set = product(load_task_statuses, visit_types) - - for labels in labels_set: - _accepted_save_requests_gauge.labels(*labels).set(0) - - duration_load_task_statuses = ( - SAVE_TASK_FAILED, - SAVE_TASK_SUCCEEDED, - ) - - for labels in product(duration_load_task_statuses, visit_types): - _accepted_save_requests_delay_gauge.labels(*labels).set(0) - - for sor in SaveOriginRequest.objects.all(): - if sor.status == SAVE_REQUEST_ACCEPTED: - _accepted_save_requests_gauge.labels( - load_task_status=sor.loading_task_status, - visit_type=sor.visit_type, - ).inc() - - _submitted_save_requests_gauge.labels( - status=sor.status, visit_type=sor.visit_type - ).inc() - - if ( - sor.loading_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED) - and sor.visit_date is not None - and sor.request_date is not None - ): - delay = sor.visit_date.timestamp() - sor.request_date.timestamp() - _accepted_save_requests_delay_gauge.labels( - load_task_status=sor.loading_task_status, - visit_type=sor.visit_type, - ).inc(delay) diff --git a/swh/web/tests/metrics/__init__.py b/swh/web/tests/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swh/web/tests/misc/test_metrics.py b/swh/web/tests/metrics/test_metrics.py similarity index 98% rename from swh/web/tests/misc/test_metrics.py rename to 
diff --git a/swh/web/tests/metrics/__init__.py b/swh/web/tests/metrics/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swh/web/tests/misc/test_metrics.py b/swh/web/tests/metrics/test_metrics.py
similarity index 98%
rename from swh/web/tests/misc/test_metrics.py
rename to swh/web/tests/metrics/test_metrics.py
index b1282d22..75a9ab7f 100644
--- a/swh/web/tests/misc/test_metrics.py
+++ b/swh/web/tests/metrics/test_metrics.py
@@ -1,137 +1,137 @@
 # Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import timedelta
 from itertools import product
 import random
 
 from prometheus_client.exposition import CONTENT_TYPE_LATEST
 import pytest
 
+from swh.web.metrics.prometheus import (
+    ACCEPTED_SAVE_REQUESTS_DELAY_METRIC,
+    ACCEPTED_SAVE_REQUESTS_METRIC,
+    SUBMITTED_SAVE_REQUESTS_METRIC,
+    get_savable_visit_types,
+)
 from swh.web.save_code_now.models import (
     SAVE_REQUEST_ACCEPTED,
     SAVE_REQUEST_PENDING,
     SAVE_REQUEST_REJECTED,
     SAVE_TASK_FAILED,
     SAVE_TASK_NOT_CREATED,
     SAVE_TASK_NOT_YET_SCHEDULED,
     SAVE_TASK_RUNNING,
     SAVE_TASK_SCHEDULED,
     SAVE_TASK_SUCCEEDED,
     SaveOriginRequest,
 )
-from swh.web.save_code_now.origin_save import (
-    ACCEPTED_SAVE_REQUESTS_DELAY_METRIC,
-    ACCEPTED_SAVE_REQUESTS_METRIC,
-    SUBMITTED_SAVE_REQUESTS_METRIC,
-    get_savable_visit_types,
-)
 from swh.web.tests.django_asserts import assert_contains
 from swh.web.tests.helpers import check_http_get_response
 from swh.web.utils import reverse
 
 
 @pytest.mark.django_db
 def test_origin_save_metrics(client, swh_scheduler):
     visit_types = get_savable_visit_types()
 
     request_statuses = (
         SAVE_REQUEST_ACCEPTED,
         SAVE_REQUEST_REJECTED,
         SAVE_REQUEST_PENDING,
     )
 
     load_task_statuses = (
         SAVE_TASK_NOT_CREATED,
         SAVE_TASK_NOT_YET_SCHEDULED,
         SAVE_TASK_SCHEDULED,
         SAVE_TASK_SUCCEEDED,
         SAVE_TASK_FAILED,
         SAVE_TASK_RUNNING,
     )
 
     for _ in range(random.randint(50, 100)):
         visit_type = random.choice(visit_types)
         request_satus = random.choice(request_statuses)
         load_task_status = random.choice(load_task_statuses)
         sor = SaveOriginRequest.objects.create(
             origin_url="origin",
             visit_type=visit_type,
             status=request_satus,
             loading_task_status=load_task_status,
         )
         if load_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED):
             delay = random.choice(range(60))
             sor.visit_date = sor.request_date + timedelta(seconds=delay)
             sor.save()
             # Note that this injects dates in the future for the sake of the test only
 
     url = reverse("metrics-prometheus")
     resp = check_http_get_response(
         client, url, status_code=200, content_type=CONTENT_TYPE_LATEST
     )
 
     accepted_requests = SaveOriginRequest.objects.filter(status=SAVE_REQUEST_ACCEPTED)
 
     labels_set = product(visit_types, load_task_statuses)
 
     for labels in labels_set:
         sor_count = accepted_requests.filter(
             visit_type=labels[0], loading_task_status=labels[1]
         ).count()
 
         metric_text = (
             f"{ACCEPTED_SAVE_REQUESTS_METRIC}{{"
             f'load_task_status="{labels[1]}",'
             f'visit_type="{labels[0]}"}} {float(sor_count)}\n'
         )
 
         assert_contains(resp, metric_text)
 
     labels_set = product(visit_types, request_statuses)
 
     for labels in labels_set:
         sor_count = SaveOriginRequest.objects.filter(
             visit_type=labels[0], status=labels[1]
         ).count()
 
         metric_text = (
             f"{SUBMITTED_SAVE_REQUESTS_METRIC}{{"
             f'status="{labels[1]}",'
             f'visit_type="{labels[0]}"}} {float(sor_count)}\n'
         )
 
         assert_contains(resp, metric_text)
 
     # delay metrics
     save_requests = SaveOriginRequest.objects.all()
     labels_set = product(
         visit_types,
         (
             SAVE_TASK_SUCCEEDED,
             SAVE_TASK_FAILED,
         ),
     )
     for labels in labels_set:
         sors = save_requests.filter(
             visit_type=labels[0],
             loading_task_status=labels[1],
             visit_date__isnull=False,
         )
 
         delay = 0
         for sor in sors:
             delay += sor.visit_date.timestamp() - sor.request_date.timestamp()
 
         metric_delay_text = (
             f"{ACCEPTED_SAVE_REQUESTS_DELAY_METRIC}{{"
             f'load_task_status="{labels[1]}",'
             f'visit_type="{labels[0]}"}} {float(delay)}\n'
         )
 
         assert_contains(resp, metric_delay_text)
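The `assert_contains` checks above target the standard Prometheus text exposition format; a single gauge sample renders roughly as follows (label values are illustrative):

```
swh_web_accepted_save_requests{load_task_status="succeeded",visit_type="git"} 12.0
```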
diff --git a/swh/web/tests/mailmap/test_app.py b/swh/web/tests/misc/test_app.py
similarity index 50%
rename from swh/web/tests/mailmap/test_app.py
rename to swh/web/tests/misc/test_app.py
index 18bc7455..7c40aae8 100644
--- a/swh/web/tests/mailmap/test_app.py
+++ b/swh/web/tests/misc/test_app.py
@@ -1,32 +1,24 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import pytest
 
 from django.urls import get_resolver
 
-from swh.web.mailmap.urls import urlpatterns
-from swh.web.tests.django_asserts import assert_not_contains
-from swh.web.tests.helpers import check_html_get_response
-from swh.web.utils import reverse
+from swh.web.metrics.urls import urlpatterns
 
 
 @pytest.mark.django_db
 def test_mailmap_deactivate(client, mailmap_admin, django_settings):
-    """Check mailmap feature is deactivated when the swh.web.mailmap django
+    """Check metrics feature is deactivated when the swh.web.metrics django
     application is not in installed apps."""
 
     django_settings.SWH_DJANGO_APPS = [
-        app for app in django_settings.SWH_DJANGO_APPS if app != "swh.web.mailmap"
+        app for app in django_settings.SWH_DJANGO_APPS if app != "swh.web.metrics"
     ]
 
-    url = reverse("swh-web-homepage")
-    client.force_login(mailmap_admin)
-    resp = check_html_get_response(client, url, status_code=200)
-    assert_not_contains(resp, "swh-mailmap-admin-item")
-
-    mailmap_view_names = set(urlpattern.name for urlpattern in urlpatterns)
+    metrics_view_names = set(urlpattern.name for urlpattern in urlpatterns)
     all_view_names = set(get_resolver().reverse_dict.keys())
-    assert mailmap_view_names & all_view_names == set()
+    assert metrics_view_names & all_view_names == set()
+ ".urls" + try: + app_urls_spec = find_spec(app_urls) + if app_urls_spec is not None: + urlpatterns.append(url(r"^", include(app_urls))) + except ModuleNotFoundError: + assert False, f"Django application {app} not found !" + +urlpatterns += [ url(r"^favicon\.ico/$", favicon_view), url(r"^$", _default_view, name="swh-web-homepage"), url(r"^jsreverse/$", urls_js, name="js_reverse"), # keep legacy SWHID resolving URL with trailing slash for backward compatibility url( r"^(?P(swh|SWH):[0-9]+:[A-Za-z]+:[0-9A-Fa-f]+.*)/$", swhid_browse, name="browse-swhid-legacy", ), url( r"^(?P(swh|SWH):[0-9]+:[A-Za-z]+:[0-9A-Fa-f]+.*)$", swhid_browse, name="browse-swhid", ), url(r"^", include("swh.web.misc.urls")), ] -# Register URLs for each SWH Django application -for app in settings.SWH_DJANGO_APPS: - app_urls = app + ".urls" - try: - app_urls_spec = find_spec(app_urls) - if app_urls_spec is not None: - urlpatterns.append(url(r"^", include(app_urls))) - except ModuleNotFoundError: - assert False, f"Django application {app} not found !" - # allow to serve assets through django staticfiles # even if settings.DEBUG is False def insecure_serve(request, path, **kwargs): return serve(request, path, insecure=True, **kwargs) # enable to serve compressed assets through django development server if swh_web_config["serve_assets"]: static_pattern = r"^%s(?P.*)/$" % settings.STATIC_URL[1:] urlpatterns.append(url(static_pattern, insecure_serve)) handler400 = swh_handle400 # noqa handler403 = swh_handle403 # noqa handler404 = swh_handle404 # noqa handler500 = swh_handle500 # noqa diff --git a/swh/web/utils/__init__.py b/swh/web/utils/__init__.py index b9d57de6..e68a799c 100644 --- a/swh/web/utils/__init__.py +++ b/swh/web/utils/__init__.py @@ -1,523 +1,520 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import functools import os import re from typing import Any, Callable, Dict, List, Mapping, Optional import urllib.parse from bs4 import BeautifulSoup from docutils.core import publish_parts import docutils.parsers.rst import docutils.utils from docutils.writers.html5_polyglot import HTMLTranslator, Writer from iso8601 import ParseError, parse_date from pkg_resources import get_distribution -from prometheus_client.registry import CollectorRegistry import requests from requests.auth import HTTPBasicAuth from django.conf import settings from django.core.cache import cache from django.core.cache.backends.base import DEFAULT_TIMEOUT from django.http import HttpRequest, QueryDict from django.shortcuts import redirect from django.urls import resolve from django.urls import reverse as django_reverse from swh.web.auth.utils import ( ADD_FORGE_MODERATOR_PERMISSION, ADMIN_LIST_DEPOSIT_PERMISSION, MAILMAP_ADMIN_PERMISSION, ) from swh.web.config import SWH_WEB_SERVER_NAME, get_config, search from swh.web.utils.exc import BadInputExc, sentry_capture_exception -SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True) - SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}" swh_object_icons = { "alias": "mdi mdi-star", "branch": "mdi mdi-source-branch", "branches": "mdi mdi-source-branch", "content": "mdi mdi-file-document", "cnt": "mdi mdi-file-document", "directory": "mdi mdi-folder", "dir": "mdi mdi-folder", "origin": "mdi mdi-source-repository", "ori": "mdi mdi-source-repository", "person": 
"mdi mdi-account", "revisions history": "mdi mdi-history", "release": "mdi mdi-tag", "rel": "mdi mdi-tag", "releases": "mdi mdi-tag", "revision": "mdi mdi-rotate-90 mdi-source-commit", "rev": "mdi mdi-rotate-90 mdi-source-commit", "snapshot": "mdi mdi-camera", "snp": "mdi mdi-camera", "visits": "mdi mdi-calendar-month", } def reverse( viewname: str, url_args: Optional[Dict[str, Any]] = None, query_params: Optional[Mapping[str, Optional[str]]] = None, current_app: Optional[str] = None, urlconf: Optional[str] = None, request: Optional[HttpRequest] = None, ) -> str: """An override of django reverse function supporting query parameters. Args: viewname: the name of the django view from which to compute a url url_args: dictionary of url arguments indexed by their names query_params: dictionary of query parameters to append to the reversed url current_app: the name of the django app tighten to the view urlconf: url configuration module request: build an absolute URI if provided Returns: str: the url of the requested view with processed arguments and query parameters """ if url_args: url_args = {k: v for k, v in url_args.items() if v is not None} url = django_reverse( viewname, urlconf=urlconf, kwargs=url_args, current_app=current_app ) params: Dict[str, str] = {} if query_params: params = {k: v for k, v in query_params.items() if v is not None} if params: query_dict = QueryDict("", mutable=True) query_dict.update(dict(sorted(params.items()))) url += "?" + query_dict.urlencode(safe="/;:") if request is not None: url = request.build_absolute_uri(url) return url def datetime_to_utc(date): """Returns datetime in UTC without timezone info Args: date (datetime.datetime): input datetime with timezone info Returns: datetime.datetime: datetime in UTC without timezone info """ if date.tzinfo and date.tzinfo != timezone.utc: return date.astimezone(tz=timezone.utc) else: return date def parse_iso8601_date_to_utc(iso_date: str) -> datetime: """Given an ISO 8601 datetime string, parse the result as UTC datetime. Returns: a timezone-aware datetime representing the parsed date Raises: swh.web.utils.exc.BadInputExc: provided date does not respect ISO 8601 format Samples: - 2016-01-12 - 2016-01-12T09:19:12+0100 - 2007-01-14T20:34:22Z """ try: date = parse_date(iso_date) return datetime_to_utc(date) except ParseError as e: raise BadInputExc(e) def shorten_path(path): """Shorten the given path: for each hash present, only return the first 8 characters followed by an ellipsis""" sha256_re = r"([0-9a-f]{8})[0-9a-z]{56}" sha1_re = r"([0-9a-f]{8})[0-9a-f]{32}" ret = re.sub(sha256_re, r"\1...", path) return re.sub(sha1_re, r"\1...", ret) def format_utc_iso_date(iso_date, fmt="%d %B %Y, %H:%M:%S UTC"): """Turns a string representation of an ISO 8601 datetime string to UTC and format it into a more human readable one. For instance, from the following input string: '2017-05-04T13:27:13+02:00' the following one is returned: '04 May 2017, 11:27 UTC'. Custom format string may also be provided as parameter Args: iso_date (str): a string representation of an ISO 8601 date fmt (str): optional date formatting string Returns: str: a formatted string representation of the input iso date """ if not iso_date: return iso_date date = parse_iso8601_date_to_utc(iso_date) return date.strftime(fmt) def gen_path_info(path): """Function to generate path data navigation for use with a breadcrumb in the swh web ui. 
def gen_path_info(path):
    """Function to generate path data navigation for use with a breadcrumb
    in the swh web ui.

    For instance, from a path /folder1/folder2/folder3,
    it returns the following list::

        [{'name': 'folder1', 'path': 'folder1'},
         {'name': 'folder2', 'path': 'folder1/folder2'},
         {'name': 'folder3', 'path': 'folder1/folder2/folder3'}]

    Args:
        path: a filesystem path

    Returns:
        list: a list of path data for navigation as illustrated above.
    """
    path_info = []
    if path:
        sub_paths = path.strip("/").split("/")
        path_from_root = ""
        for p in sub_paths:
            path_from_root += "/" + p
            path_info.append({"name": p, "path": path_from_root.strip("/")})
    return path_info


def parse_rst(text, report_level=2):
    """
    Parse a reStructuredText string with docutils.

    Args:
        text (str): string with reStructuredText markups in it
        report_level (int): level of docutils report messages to print
            (1: info, 2: warning, 3: error, 4: severe, 5: none)

    Returns:
        docutils.nodes.document: a parsed docutils document
    """
    parser = docutils.parsers.rst.Parser()
    components = (docutils.parsers.rst.Parser,)
    settings = docutils.frontend.OptionParser(
        components=components
    ).get_default_values()
    settings.report_level = report_level
    document = docutils.utils.new_document("rst-doc", settings=settings)
    parser.parse(text, document)
    return document


def get_client_ip(request):
    """
    Return the client IP address from an incoming HTTP request.

    Args:
        request (django.http.HttpRequest): the incoming HTTP request

    Returns:
        str: The client IP address
    """
    x_forwarded_for = request.META.get("HTTP_X_FORWARDED_FOR")
    if x_forwarded_for:
        ip = x_forwarded_for.split(",")[0]
    else:
        ip = request.META.get("REMOTE_ADDR")
    return ip


def is_swh_web_development(request: HttpRequest) -> bool:
    """Indicate if we are running a development version of swh-web."""
    site_base_url = request.build_absolute_uri("/")
    return any(
        host in site_base_url for host in ("localhost", "127.0.0.1", "testserver")
    )


def is_swh_web_staging(request: HttpRequest) -> bool:
    """Indicate if we are running a staging version of swh-web."""
    config = get_config()
    site_base_url = request.build_absolute_uri("/")
    return any(
        server_name in site_base_url
        for server_name in config["staging_server_names"]
    )


def is_swh_web_production(request: HttpRequest) -> bool:
    """Indicate if we are running the public production version of swh-web."""
    return SWH_WEB_SERVER_NAME in request.build_absolute_uri("/")


browsers_supported_image_mimes = set(
    [
        "image/gif",
        "image/png",
        "image/jpeg",
        "image/bmp",
        "image/webp",
        "image/svg",
        "image/svg+xml",
    ]
)
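# Illustrative sketch (editor's note, not part of the diff): behind a reverse
# proxy the client address is the first entry of the X-Forwarded-For header,
# which django.test.RequestFactory can simulate:
#
#   from django.test import RequestFactory
#   request = RequestFactory().get(
#       "/", HTTP_X_FORWARDED_FOR="203.0.113.7, 10.0.0.1"
#   )
#   get_client_ip(request)  # -> "203.0.113.7" (first hop of the proxy chain)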
def context_processor(request):
    """
    Django context processor used to inject variables
    in all swh-web templates.
    """
    config = get_config()
    if (
        hasattr(request, "user")
        and request.user.is_authenticated
        and not hasattr(request.user, "backend")
    ):
        # Avoid django.template.base.VariableDoesNotExist errors
        # when rendering templates while a standard Django user is logged in.
        request.user.backend = "django.contrib.auth.backends.ModelBackend"

    return {
        "swh_object_icons": swh_object_icons,
        "available_languages": None,
        "swh_client_config": config["client_config"],
        "oidc_enabled": bool(config["keycloak"]["server_url"]),
        "browsers_supported_image_mimes": browsers_supported_image_mimes,
        "keycloak": config["keycloak"],
        "site_base_url": request.build_absolute_uri("/"),
        "DJANGO_SETTINGS_MODULE": os.environ["DJANGO_SETTINGS_MODULE"],
        "status": config["status"],
        "swh_web_dev": is_swh_web_development(request),
        "swh_web_staging": is_swh_web_staging(request),
        "swh_web_prod": is_swh_web_production(request),
        "swh_web_version": get_distribution("swh.web").version,
        "iframe_mode": False,
        "ADMIN_LIST_DEPOSIT_PERMISSION": ADMIN_LIST_DEPOSIT_PERMISSION,
        "ADD_FORGE_MODERATOR_PERMISSION": ADD_FORGE_MODERATOR_PERMISSION,
        "MAILMAP_ADMIN_PERMISSION": MAILMAP_ADMIN_PERMISSION,
        "lang": "en",
        "sidebar_state": request.COOKIES.get("sidebar-state", "expanded"),
        "SWH_DJANGO_APPS": settings.SWH_DJANGO_APPS,
    }


def resolve_branch_alias(
    snapshot: Dict[str, Any], branch: Optional[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """
    Resolve a branch alias in snapshot content.

    Args:
        snapshot: a full snapshot content
        branch: a branch alias contained in the snapshot

    Returns:
        The real snapshot branch that got aliased.
    """
    while branch and branch["target_type"] == "alias":
        if branch["target"] in snapshot["branches"]:
            branch = snapshot["branches"][branch["target"]]
        else:
            from swh.web.utils import archive

            snp = archive.lookup_snapshot(
                snapshot["id"], branches_from=branch["target"], branches_count=1
            )
            if snp and branch["target"] in snp["branches"]:
                branch = snp["branches"][branch["target"]]
            else:
                branch = None
    return branch


class _NoHeaderHTMLTranslator(HTMLTranslator):
    """
    Docutils translator subclass to customize the generation of HTML
    from reST-formatted docstrings
    """

    def __init__(self, document):
        super().__init__(document)
        self.body_prefix = []
        self.body_suffix = []


_HTML_WRITER = Writer()
_HTML_WRITER.translator_class = _NoHeaderHTMLTranslator
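# Illustrative sketch (editor's note, not part of the diff): alias resolution
# follows "alias" targets until a concrete branch is reached, e.g. a git HEAD
# pointing at a master branch (hypothetical snapshot values below):
#
#   snapshot = {
#       "id": "...",
#       "branches": {
#           "HEAD": {"target_type": "alias", "target": "refs/heads/master"},
#           "refs/heads/master": {"target_type": "revision", "target": "..."},
#       },
#   }
#   resolve_branch_alias(snapshot, snapshot["branches"]["HEAD"])
#   # -> {"target_type": "revision", "target": "..."}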
{pp["html_body"]}
def prettify_html(html: str) -> str:
    """
    Prettify an HTML document.

    Args:
        html: Input HTML document

    Returns:
        The prettified HTML document
    """
    return BeautifulSoup(html, "lxml").prettify()


def django_cache(
    timeout: int = DEFAULT_TIMEOUT,
    catch_exception: bool = False,
    exception_return_value: Any = None,
    invalidate_cache_pred: Callable[[Any], bool] = lambda val: False,
):
    """Decorator to put the result of a function call in Django cache;
    subsequent calls will directly return the cached value.

    Args:
        timeout: The number of seconds the value will be held in cache
        catch_exception: If :const:`True`, any exception thrown by the
            decorated function will be caught and not reraised
        exception_return_value: The value to return if the previous
            parameter is set to :const:`True`
        invalidate_cache_pred: A predicate function for invalidating the
            cache under certain conditions; when it returns :const:`True`,
            the decorated function is called again and the cache updated

    Returns:
        The returned value of the decorated function for the specified
        parameters
    """

    def inner(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            func_args = args + (0,) + tuple(sorted(kwargs.items()))
            cache_key = str(hash((func.__module__, func.__name__) + func_args))
            ret = cache.get(cache_key)
            if ret is None or invalidate_cache_pred(ret):
                try:
                    ret = func(*args, **kwargs)
                except Exception as exc:
                    if catch_exception:
                        sentry_capture_exception(exc)
                        return exception_return_value
                    else:
                        raise
                else:
                    cache.set(cache_key, ret, timeout=timeout)
            return ret

        return wrapper

    return inner


def _deposits_list_url(
    deposits_list_base_url: str, page_size: int, username: Optional[str]
) -> str:
    params = {"page_size": str(page_size)}
    if username is not None:
        params["username"] = username
    return f"{deposits_list_base_url}?{urllib.parse.urlencode(params)}"


def get_deposits_list(username: Optional[str] = None) -> List[Dict[str, Any]]:
    """Return the list of software deposits using swh-deposit API"""
    config = get_config()["deposit"]
    private_api_url = config["private_api_url"].rstrip("/") + "/"
    deposits_list_base_url = private_api_url + "deposits"
    deposits_list_auth = HTTPBasicAuth(
        config["private_api_user"], config["private_api_password"]
    )

    deposits_list_url = _deposits_list_url(
        deposits_list_base_url, page_size=1, username=username
    )

    nb_deposits = requests.get(
        deposits_list_url, auth=deposits_list_auth, timeout=30
    ).json()["count"]

    @django_cache(invalidate_cache_pred=lambda data: data["count"] != nb_deposits)
    def _get_deposits_data():
        deposits_list_url = _deposits_list_url(
            deposits_list_base_url, page_size=nb_deposits, username=username
        )
        return requests.get(
            deposits_list_url,
            auth=deposits_list_auth,
            timeout=30,
        ).json()

    deposits_data = _get_deposits_data()
    return deposits_data["results"]


_origin_visit_types_cache_timeout = 24 * 60 * 60  # 24 hours


@django_cache(
    timeout=_origin_visit_types_cache_timeout,
    catch_exception=True,
    exception_return_value=[],
)
def origin_visit_types() -> List[str]:
    """Return the exhaustive list of visit types for origins
    ingested into the archive.
    """
    return sorted(search().visit_types_count().keys())


def redirect_to_new_route(request, new_route, permanent=True):
    """Redirect a request to another route with url args and query parameters,
    e.g. /origin/<url>/log?path=test can be redirected as
    /log?url=<url>&path=test. This can be used to deprecate routes.
    """
    request_path = resolve(request.path_info)
    args = {**request_path.kwargs, **request.GET.dict()}
    return redirect(
        reverse(new_route, query_params=args),
        permanent=permanent,
    )
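# Illustrative sketch (editor's note, not part of the diff): django_cache
# memoizes a function result in the configured Django cache backend; the
# hypothetical cached_visit_types_count() below stands in for any expensive
# lookup, mirroring how origin_visit_types uses the decorator above:
#
#   @django_cache(timeout=3600, catch_exception=True, exception_return_value={})
#   def cached_visit_types_count():
#       return search().visit_types_count()
#
# The first call hits the search backend and stores the result for an hour;
# subsequent calls with the same arguments return the cached value, and any
# exception is reported to Sentry while {} is returned instead of reraising.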