Changeset View
Changeset View
Standalone View
Standalone View
swh/web/common/origin_save.py
# Copyright (C) 2018-2020 The Software Heritage developers | # Copyright (C) 2018-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from bisect import bisect_right | from bisect import bisect_right | ||||
from datetime import datetime, timedelta, timezone | from datetime import datetime, timedelta, timezone | ||||
from itertools import product | from itertools import product | ||||
import json | import json | ||||
import logging | import logging | ||||
from typing import Any, Dict | from typing import Any, Dict, List, Optional, Tuple | ||||
from prometheus_client import Gauge | from prometheus_client import Gauge | ||||
import requests | import requests | ||||
import sentry_sdk | import sentry_sdk | ||||
from django.core.exceptions import ObjectDoesNotExist, ValidationError | from django.core.exceptions import ObjectDoesNotExist, ValidationError | ||||
from django.core.validators import URLValidator | from django.core.validators import URLValidator | ||||
from django.db.models import QuerySet | |||||
from django.utils.html import escape | from django.utils.html import escape | ||||
from swh.scheduler.utils import create_oneshot_task_dict | from swh.scheduler.utils import create_oneshot_task_dict | ||||
from swh.web import config | from swh.web import config | ||||
from swh.web.common import archive | from swh.web.common import archive | ||||
from swh.web.common.exc import BadInputExc, ForbiddenExc, NotFoundExc | from swh.web.common.exc import BadInputExc, ForbiddenExc, NotFoundExc | ||||
from swh.web.common.models import ( | from swh.web.common.models import ( | ||||
SAVE_REQUEST_ACCEPTED, | SAVE_REQUEST_ACCEPTED, | ||||
SAVE_REQUEST_PENDING, | SAVE_REQUEST_PENDING, | ||||
SAVE_REQUEST_REJECTED, | SAVE_REQUEST_REJECTED, | ||||
SAVE_TASK_FAILED, | SAVE_TASK_FAILED, | ||||
SAVE_TASK_NOT_CREATED, | SAVE_TASK_NOT_CREATED, | ||||
SAVE_TASK_NOT_YET_SCHEDULED, | SAVE_TASK_NOT_YET_SCHEDULED, | ||||
SAVE_TASK_RUNNING, | SAVE_TASK_RUNNING, | ||||
SAVE_TASK_SCHEDULED, | SAVE_TASK_SCHEDULED, | ||||
SAVE_TASK_SUCCEEDED, | SAVE_TASK_SUCCEEDED, | ||||
SaveAuthorizedOrigin, | SaveAuthorizedOrigin, | ||||
SaveOriginRequest, | SaveOriginRequest, | ||||
SaveUnauthorizedOrigin, | SaveUnauthorizedOrigin, | ||||
) | ) | ||||
from swh.web.common.origin_visits import get_origin_visits | from swh.web.common.origin_visits import get_origin_visits | ||||
from swh.web.common.typing import OriginInfo | |||||
from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc | from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc | ||||
scheduler = config.scheduler() | scheduler = config.scheduler() | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def get_origin_save_authorized_urls(): | def get_origin_save_authorized_urls() -> List[str]: | ||||
""" | """ | ||||
Get the list of origin url prefixes authorized to be | Get the list of origin url prefixes authorized to be | ||||
immediately loaded into the archive (whitelist). | immediately loaded into the archive (whitelist). | ||||
Returns: | Returns: | ||||
list: The list of authorized origin url prefixes | list: The list of authorized origin url prefixes | ||||
""" | """ | ||||
return [origin.url for origin in SaveAuthorizedOrigin.objects.all()] | return [origin.url for origin in SaveAuthorizedOrigin.objects.all()] | ||||
def get_origin_save_unauthorized_urls(): | def get_origin_save_unauthorized_urls() -> List[str]: | ||||
""" | """ | ||||
Get the list of origin url prefixes forbidden to be | Get the list of origin url prefixes forbidden to be | ||||
loaded into the archive (blacklist). | loaded into the archive (blacklist). | ||||
Returns: | Returns: | ||||
list: the list of unauthorized origin url prefixes | list: the list of unauthorized origin url prefixes | ||||
""" | """ | ||||
return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()] | return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()] | ||||
def can_save_origin(origin_url): | def can_save_origin(origin_url: str) -> str: | ||||
""" | """ | ||||
Check if a software origin can be saved into the archive. | Check if a software origin can be saved into the archive. | ||||
Based on the origin url, the save request will be either: | Based on the origin url, the save request will be either: | ||||
* immediately accepted if the url is whitelisted | * immediately accepted if the url is whitelisted | ||||
* rejected if the url is blacklisted | * rejected if the url is blacklisted | ||||
* put in pending state for manual review otherwise | * put in pending state for manual review otherwise | ||||
Show All 39 Lines | _save_task_run_status = { | ||||
"eventful": SAVE_TASK_SUCCEEDED, | "eventful": SAVE_TASK_SUCCEEDED, | ||||
"uneventful": SAVE_TASK_SUCCEEDED, | "uneventful": SAVE_TASK_SUCCEEDED, | ||||
"failed": SAVE_TASK_FAILED, | "failed": SAVE_TASK_FAILED, | ||||
"permfailed": SAVE_TASK_FAILED, | "permfailed": SAVE_TASK_FAILED, | ||||
"lost": SAVE_TASK_FAILED, | "lost": SAVE_TASK_FAILED, | ||||
} | } | ||||
def get_savable_visit_types(): | def get_savable_visit_types() -> List[str]: | ||||
return sorted(list(_visit_type_task.keys())) | |||||
def _check_visit_type_savable(visit_type): | |||||
""" | """ | ||||
Get the list of visit types that can be performed | Get the list of visit types that can be performed | ||||
through a save request. | through a save request. | ||||
Returns: | Returns: | ||||
list: the list of saveable visit types | list: the list of saveable visit types | ||||
""" | """ | ||||
return sorted(list(_visit_type_task.keys())) | |||||
def _check_visit_type_savable(visit_type: str) -> None: | |||||
allowed_visit_types = ", ".join(get_savable_visit_types()) | allowed_visit_types = ", ".join(get_savable_visit_types()) | ||||
if visit_type not in _visit_type_task: | if visit_type not in _visit_type_task: | ||||
raise BadInputExc( | raise BadInputExc( | ||||
"Visit of type %s can not be saved! " | "Visit of type %s can not be saved! " | ||||
"Allowed types are the following: %s" % (visit_type, allowed_visit_types) | "Allowed types are the following: %s" % (visit_type, allowed_visit_types) | ||||
) | ) | ||||
_validate_url = URLValidator(schemes=["http", "https", "svn", "git"]) | _validate_url = URLValidator(schemes=["http", "https", "svn", "git"]) | ||||
def _check_origin_url_valid(origin_url): | def _check_origin_url_valid(origin_url: str) -> None: | ||||
try: | try: | ||||
_validate_url(origin_url) | _validate_url(origin_url) | ||||
except ValidationError: | except ValidationError: | ||||
raise BadInputExc( | raise BadInputExc( | ||||
"The provided origin url (%s) is not valid!" % escape(origin_url) | "The provided origin url (%s) is not valid!" % escape(origin_url) | ||||
) | ) | ||||
def _get_visit_info_for_save_request(save_request): | def _get_visit_info_for_save_request( | ||||
save_request: SaveOriginRequest, | |||||
) -> Tuple[Optional[datetime], Optional[str]]: | |||||
visit_date = None | visit_date = None | ||||
visit_status = None | visit_status = None | ||||
time_now = datetime.now(tz=timezone.utc) | time_now = datetime.now(tz=timezone.utc) | ||||
time_delta = time_now - save_request.request_date | time_delta = time_now - save_request.request_date | ||||
# stop trying to find a visit date one month after save request submission | # stop trying to find a visit date one month after save request submission | ||||
# as those requests to storage are expensive and associated loading task | # as those requests to storage are expensive and associated loading task | ||||
# surely ended up with errors | # surely ended up with errors | ||||
if time_delta.days <= 30: | if time_delta.days <= 30: | ||||
try: | try: | ||||
origin = {"url": save_request.origin_url} | origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url)) | ||||
origin_info = archive.lookup_origin(origin) | |||||
origin_visits = get_origin_visits(origin_info) | origin_visits = get_origin_visits(origin_info) | ||||
visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits] | visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits] | ||||
i = bisect_right(visit_dates, save_request.request_date) | i = bisect_right(visit_dates, save_request.request_date) | ||||
if i != len(visit_dates): | if i != len(visit_dates): | ||||
visit_date = visit_dates[i] | visit_date = visit_dates[i] | ||||
visit_status = origin_visits[i]["status"] | visit_status = origin_visits[i]["status"] | ||||
if origin_visits[i]["status"] not in ("full", "partial", "not_found"): | if origin_visits[i]["status"] not in ("full", "partial", "not_found"): | ||||
visit_date = None | visit_date = None | ||||
except Exception as exc: | except Exception as exc: | ||||
sentry_sdk.capture_exception(exc) | sentry_sdk.capture_exception(exc) | ||||
return visit_date, visit_status | return visit_date, visit_status | ||||
def _check_visit_update_status(save_request, save_task_status): | def _check_visit_update_status( | ||||
save_request: SaveOriginRequest, save_task_status: str | |||||
) -> Tuple[Optional[datetime], str]: | |||||
visit_date, visit_status = _get_visit_info_for_save_request(save_request) | visit_date, visit_status = _get_visit_info_for_save_request(save_request) | ||||
save_request.visit_date = visit_date | save_request.visit_date = visit_date | ||||
# visit has been performed, mark the saving task as succeeded | # visit has been performed, mark the saving task as succeeded | ||||
if visit_date and visit_status is not None: | if visit_date and visit_status is not None: | ||||
save_task_status = SAVE_TASK_SUCCEEDED | save_task_status = SAVE_TASK_SUCCEEDED | ||||
elif visit_status in ("created", "ongoing"): | elif visit_status in ("created", "ongoing"): | ||||
save_task_status = SAVE_TASK_RUNNING | save_task_status = SAVE_TASK_RUNNING | ||||
elif visit_status in ("not_found", "failed"): | elif visit_status in ("not_found", "failed"): | ||||
save_task_status = SAVE_TASK_FAILED | save_task_status = SAVE_TASK_FAILED | ||||
else: | else: | ||||
time_now = datetime.now(tz=timezone.utc) | time_now = datetime.now(tz=timezone.utc) | ||||
time_delta = time_now - save_request.request_date | time_delta = time_now - save_request.request_date | ||||
# consider the task as failed if it is still in scheduled state | # consider the task as failed if it is still in scheduled state | ||||
# 30 days after its submission | # 30 days after its submission | ||||
if time_delta.days > 30: | if time_delta.days > 30: | ||||
save_task_status = SAVE_TASK_FAILED | save_task_status = SAVE_TASK_FAILED | ||||
return visit_date, save_task_status | return visit_date, save_task_status | ||||
def _save_request_dict(save_request, task=None, task_run=None): | def _save_request_dict( | ||||
save_request: SaveOriginRequest, | |||||
task: Optional[Dict[str, Any]] = None, | |||||
task_run: Optional[Dict[str, Any]] = None, | |||||
) -> Dict[str, Any]: | |||||
must_save = False | must_save = False | ||||
visit_date = save_request.visit_date | visit_date = save_request.visit_date | ||||
# save task still in scheduler db | # save task still in scheduler db | ||||
if task: | if task: | ||||
save_task_status = _save_task_status[task["status"]] | save_task_status = _save_task_status[task["status"]] | ||||
if task_run: | if task_run: | ||||
save_task_status = _save_task_run_status[task_run["status"]] | save_task_status = _save_task_run_status[task_run["status"]] | ||||
# Consider request from which a visit date has already been found | # Consider request from which a visit date has already been found | ||||
Show All 39 Lines | return { | ||||
"origin_url": save_request.origin_url, | "origin_url": save_request.origin_url, | ||||
"save_request_date": save_request.request_date.isoformat(), | "save_request_date": save_request.request_date.isoformat(), | ||||
"save_request_status": save_request.status, | "save_request_status": save_request.status, | ||||
"save_task_status": save_task_status, | "save_task_status": save_task_status, | ||||
"visit_date": visit_date.isoformat() if visit_date else None, | "visit_date": visit_date.isoformat() if visit_date else None, | ||||
} | } | ||||
def create_save_origin_request(visit_type, origin_url): | def create_save_origin_request(visit_type: str, origin_url: str) -> Dict[str, Any]: | ||||
""" | """ | ||||
Create a loading task to save a software origin into the archive. | Create a loading task to save a software origin into the archive. | ||||
This function aims to create a software origin loading task | This function aims to create a software origin loading task | ||||
through the use of the swh-scheduler component. | through the use of the swh-scheduler component. | ||||
First, some checks are performed to see if the visit type and origin | First, some checks are performed to see if the visit type and origin | ||||
url are valid but also if the save request can be accepted. | url are valid but also if the save request can be accepted. | ||||
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines | def create_save_origin_request(visit_type: str, origin_url: str) -> Dict[str, Any]: | ||||
if save_request_status == SAVE_REQUEST_REJECTED: | if save_request_status == SAVE_REQUEST_REJECTED: | ||||
raise ForbiddenExc( | raise ForbiddenExc( | ||||
( | ( | ||||
'The "save code now" request has been rejected ' | 'The "save code now" request has been rejected ' | ||||
"because the provided origin url is blacklisted." | "because the provided origin url is blacklisted." | ||||
) | ) | ||||
) | ) | ||||
assert sor is not None | |||||
return _save_request_dict(sor, task) | return _save_request_dict(sor, task) | ||||
def get_save_origin_requests_from_queryset(requests_queryset): | def get_save_origin_requests_from_queryset( | ||||
requests_queryset: QuerySet, | |||||
) -> List[Dict[str, Any]]: | |||||
""" | """ | ||||
Get all save requests from a SaveOriginRequest queryset. | Get all save requests from a SaveOriginRequest queryset. | ||||
Args: | Args: | ||||
requests_queryset (django.db.models.QuerySet): input | requests_queryset (django.db.models.QuerySet): input | ||||
SaveOriginRequest queryset | SaveOriginRequest queryset | ||||
Returns: | Returns: | ||||
Show All 12 Lines | if task_ids: | ||||
for sor in requests_queryset: | for sor in requests_queryset: | ||||
sr_dict = _save_request_dict( | sr_dict = _save_request_dict( | ||||
sor, tasks.get(sor.loading_task_id), task_runs.get(sor.loading_task_id) | sor, tasks.get(sor.loading_task_id), task_runs.get(sor.loading_task_id) | ||||
) | ) | ||||
save_requests.append(sr_dict) | save_requests.append(sr_dict) | ||||
return save_requests | return save_requests | ||||
def get_save_origin_requests(visit_type, origin_url): | def get_save_origin_requests(visit_type: str, origin_url: str) -> List[Dict[str, Any]]: | ||||
""" | """ | ||||
Get all save requests for a given software origin. | Get all save requests for a given software origin. | ||||
Args: | Args: | ||||
visit_type (str): the type of visit | visit_type (str): the type of visit | ||||
origin_url (str): the url of the origin | origin_url (str): the url of the origin | ||||
Raises: | Raises: | ||||
▲ Show 20 Lines • Show All 166 Lines • ▼ Show 20 Lines | |||||
_accepted_save_requests_gauge = Gauge( | _accepted_save_requests_gauge = Gauge( | ||||
name=ACCEPTED_SAVE_REQUESTS_METRIC, | name=ACCEPTED_SAVE_REQUESTS_METRIC, | ||||
documentation="Number of accepted origin save requests", | documentation="Number of accepted origin save requests", | ||||
labelnames=["load_task_status", "visit_type"], | labelnames=["load_task_status", "visit_type"], | ||||
registry=SWH_WEB_METRICS_REGISTRY, | registry=SWH_WEB_METRICS_REGISTRY, | ||||
) | ) | ||||
def compute_save_requests_metrics(): | def compute_save_requests_metrics() -> None: | ||||
"""Compute a couple of Prometheus metrics related to | """Compute a couple of Prometheus metrics related to | ||||
origin save requests""" | origin save requests""" | ||||
request_statuses = ( | request_statuses = ( | ||||
SAVE_REQUEST_ACCEPTED, | SAVE_REQUEST_ACCEPTED, | ||||
SAVE_REQUEST_REJECTED, | SAVE_REQUEST_REJECTED, | ||||
SAVE_REQUEST_PENDING, | SAVE_REQUEST_PENDING, | ||||
) | ) | ||||
Show All 31 Lines |