diff --git a/assets/src/bundles/save/index.js b/assets/src/bundles/save/index.js index 06876012..d2c54ac7 100644 --- a/assets/src/bundles/save/index.js +++ b/assets/src/bundles/save/index.js @@ -1,567 +1,561 @@ /** * Copyright (C) 2018-2021 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU Affero General Public License version 3, or any later version * See top-level LICENSE file for more information */ import {csrfPost, handleFetchError, isGitRepoUrl, htmlAlert, removeUrlFragment, getCanonicalOriginURL} from 'utils/functions'; import {swhSpinnerSrc} from 'utils/constants'; import artifactFormRowTemplate from './artifact-form-row.ejs'; let saveRequestsTable; async function originSaveRequest( originType, originUrl, extraData, acceptedCallback, pendingCallback, errorCallback ) { // Actually trigger the origin save request const addSaveOriginRequestUrl = Urls.api_1_save_origin(originType, originUrl); $('.swh-processing-save-request').css('display', 'block'); let headers = {}; let body = null; if (extraData !== {}) { body = JSON.stringify(extraData); headers = { 'Content-Type': 'application/json' }; }; try { const response = await csrfPost(addSaveOriginRequestUrl, headers, body); handleFetchError(response); const data = await response.json(); $('.swh-processing-save-request').css('display', 'none'); if (data.save_request_status === 'accepted') { acceptedCallback(); } else { pendingCallback(); } } catch (response) { $('.swh-processing-save-request').css('display', 'none'); const errorData = await response.json(); errorCallback(response.status, errorData); }; } function addArtifactVersionAutofillHandler(formId) { // autofill artifact version input with the filename from // the artifact url without extensions $(`#swh-input-artifact-url-${formId}`).on('input', function(event) { const artifactUrl = $(this).val().trim(); let filename = artifactUrl.split('/').slice(-1)[0]; if (filename !== artifactUrl) { filename = filename.replace(/tar.*$/, 'tar'); const filenameNoExt = filename.split('.').slice(0, -1).join('.'); const artifactVersion = $(`#swh-input-artifact-version-${formId}`); if (filenameNoExt !== filename) { artifactVersion.val(filenameNoExt); } } }); } export function maybeRequireExtraInputs() { // Read the actual selected value and depending on the origin type, display some extra // inputs or hide them. This makes the extra inputs disabled when not displayed. 
const originType = $('#swh-input-visit-type').val(); let display = 'none'; let disabled = true; if (originType === 'archives') { display = 'flex'; disabled = false; } $('.swh-save-origin-archives-form').css('display', display); if (!disabled) { // help paragraph must have block display for proper rendering $('#swh-save-origin-archives-help').css('display', 'block'); } $('.swh-save-origin-archives-form .form-control').prop('disabled', disabled); if (originType === 'archives' && $('.swh-save-origin-archives-form').length === 1) { // insert first artifact row when the archives visit type is selected for the first time $('.swh-save-origin-archives-form').last().after( artifactFormRowTemplate({deletableRow: false, formId: 0})); addArtifactVersionAutofillHandler(0); } } export function addArtifactFormRow() { const formId = $('.swh-save-origin-artifact-form').length; $('.swh-save-origin-artifact-form').last().after( artifactFormRowTemplate({ deletableRow: true, formId: formId }) ); addArtifactVersionAutofillHandler(formId); } export function deleteArtifactFormRow(event) { $(event.target).closest('.swh-save-origin-artifact-form').remove(); } const userRequestsFilterCheckbox = `
`; export function initOriginSave() { - $(document).ready(async() => { + $(document).ready(() => { $.fn.dataTable.ext.errMode = 'none'; - const response = await fetch(Urls.origin_save_types_list()); - const data = await response.json(); - - for (const originType of data) { - $('#swh-input-visit-type').append(``); - } // set git as the default value as before $('#swh-input-visit-type').val('git'); saveRequestsTable = $('#swh-origin-save-requests') .on('error.dt', (e, settings, techNote, message) => { $('#swh-origin-save-request-list-error').text('An error occurred while retrieving the save requests list'); console.log(message); }) .DataTable({ serverSide: true, processing: true, language: { processing: `` }, ajax: { url: Urls.origin_save_requests_list('all'), data: (d) => { if (swh.webapp.isUserLoggedIn() && $('#swh-save-requests-user-filter').prop('checked')) { d.user_requests_only = '1'; } } }, searchDelay: 1000, // see https://datatables.net/examples/advanced_init/dom_toolbar.html and the comments section // this option customizes datatables UI components by adding an extra checkbox above the table // while keeping bootstrap layout dom: '<"row"<"col-sm-3"l><"col-sm-6 text-left user-requests-filter"><"col-sm-3"f>>' + '<"row"<"col-sm-12"tr>>' + '<"row"<"col-sm-5"i><"col-sm-7"p>>', fnInitComplete: function() { if (swh.webapp.isUserLoggedIn()) { $('div.user-requests-filter').html(userRequestsFilterCheckbox); $('#swh-save-requests-user-filter').on('change', () => { saveRequestsTable.draw(); }); } }, columns: [ { data: 'save_request_date', name: 'request_date', render: (data, type, row) => { if (type === 'display') { const date = new Date(data); return date.toLocaleString(); } return data; } }, { data: 'visit_type', name: 'visit_type' }, { data: 'origin_url', name: 'origin_url', render: (data, type, row) => { if (type === 'display') { let html = ''; const sanitizedURL = $.fn.dataTable.render.text().display(data); if (row.save_task_status === 'succeeded') { let browseOriginUrl = `${Urls.browse_origin()}?origin_url=${encodeURIComponent(sanitizedURL)}`; if (row.visit_date) { browseOriginUrl += `&timestamp=${encodeURIComponent(row.visit_date)}`; } html += `${sanitizedURL}`; } else { html += sanitizedURL; } html += ` ` + ''; return html; } return data; } }, { data: 'save_request_status', name: 'status' }, { data: 'save_task_status', name: 'loading_task_status' }, { name: 'info', render: (data, type, row) => { if (row.save_task_status === 'succeeded' || row.save_task_status === 'failed') { return ``; } else { return ''; } } }, { render: (data, type, row) => { if (row.save_request_status === 'accepted') { const saveAgainButton = ''; return saveAgainButton; } else { return ''; } } } ], scrollY: '50vh', scrollCollapse: true, order: [[0, 'desc']], responsive: { details: { type: 'none' } } }); swh.webapp.addJumpToPagePopoverToDataTable(saveRequestsTable); $('#swh-origin-save-requests-list-tab').on('shown.bs.tab', () => { saveRequestsTable.draw(); window.location.hash = '#requests'; }); $('#swh-origin-save-request-help-tab').on('shown.bs.tab', () => { removeUrlFragment(); $('.swh-save-request-info').popover('dispose'); }); const saveRequestAcceptedAlert = htmlAlert( 'success', 'The "save code now" request has been accepted and will be processed as soon as possible.', true ); const saveRequestPendingAlert = htmlAlert( 'warning', 'The "save code now" request has been put in pending state and may be accepted for processing after manual review.', true ); const saveRequestRateLimitedAlert = htmlAlert( 
'danger', 'The rate limit for "save code now" requests has been reached. Please try again later.', true ); const saveRequestUnknownErrorAlert = htmlAlert( 'danger', 'An unexpected error happened when submitting the "save code now request".', true ); $('#swh-save-origin-form').submit(async event => { event.preventDefault(); event.stopPropagation(); $('.alert').alert('close'); if (event.target.checkValidity()) { $(event.target).removeClass('was-validated'); const originType = $('#swh-input-visit-type').val(); let originUrl = $('#swh-input-origin-url').val(); originUrl = await getCanonicalOriginURL(originUrl); // read the extra inputs for the 'archives' type const extraData = {}; if (originType === 'archives') { extraData['archives_data'] = []; for (let i = 0; i < $('.swh-save-origin-artifact-form').length; ++i) { extraData['archives_data'].push({ 'artifact_url': $(`#swh-input-artifact-url-${i}`).val(), 'artifact_version': $(`#swh-input-artifact-version-${i}`).val() }); } } originSaveRequest(originType, originUrl, extraData, () => $('#swh-origin-save-request-status').html(saveRequestAcceptedAlert), () => $('#swh-origin-save-request-status').html(saveRequestPendingAlert), (statusCode, errorData) => { $('#swh-origin-save-request-status').css('color', 'red'); if (statusCode === 403) { const errorAlert = htmlAlert('danger', `Error: ${errorData['reason']}`); $('#swh-origin-save-request-status').html(errorAlert); } else if (statusCode === 429) { $('#swh-origin-save-request-status').html(saveRequestRateLimitedAlert); } else if (statusCode === 400) { const errorAlert = htmlAlert('danger', errorData['reason']); $('#swh-origin-save-request-status').html(errorAlert); } else { $('#swh-origin-save-request-status').html(saveRequestUnknownErrorAlert); } }); } else { $(event.target).addClass('was-validated'); } }); $('#swh-show-origin-save-requests-list').on('click', (event) => { event.preventDefault(); $('.nav-tabs a[href="#swh-origin-save-requests-list"]').tab('show'); }); $('#swh-input-origin-url').on('input', function(event) { const originUrl = $(this).val().trim(); $(this).val(originUrl); $('#swh-input-visit-type option').each(function() { const val = $(this).val(); if (val && originUrl.includes(val)) { $(this).prop('selected', true); } }); }); if (window.location.hash === '#requests') { $('.nav-tabs a[href="#swh-origin-save-requests-list"]').tab('show'); } }); } export function validateSaveOriginUrl(input) { const originType = $('#swh-input-visit-type').val(); let originUrl = null; let validUrl = true; try { originUrl = new URL(input.value.trim()); } catch (TypeError) { validUrl = false; } if (validUrl) { const allowedProtocols = ['http:', 'https:', 'svn:', 'git:']; validUrl = ( allowedProtocols.find(protocol => protocol === originUrl.protocol) !== undefined ); } if (validUrl && originType === 'git') { // additional checks for well known code hosting providers switch (originUrl.hostname) { case 'github.com': validUrl = isGitRepoUrl(originUrl); break; case 'git.code.sf.net': validUrl = isGitRepoUrl(originUrl, '/p/'); break; case 'bitbucket.org': validUrl = isGitRepoUrl(originUrl); break; default: if (originUrl.hostname.startsWith('gitlab.')) { validUrl = isGitRepoUrl(originUrl); } break; } } if (validUrl) { input.setCustomValidity(''); } else { input.setCustomValidity('The origin url is not valid or does not reference a code repository'); } } export function initTakeNewSnapshot() { const newSnapshotRequestAcceptedAlert = htmlAlert( 'success', 'The "take new snapshot" request has been accepted and will 
be processed as soon as possible.', true ); const newSnapshotRequestPendingAlert = htmlAlert( 'warning', 'The "take new snapshot" request has been put in pending state and may be accepted for processing after manual review.', true ); const newSnapshotRequestRateLimitAlert = htmlAlert( 'danger', 'The rate limit for "take new snapshot" requests has been reached. Please try again later.', true ); const newSnapshotRequestUnknownErrorAlert = htmlAlert( 'danger', 'An unexpected error happened when submitting the "save code now request".', true ); $(document).ready(() => { $('#swh-take-new-snapshot-form').submit(event => { event.preventDefault(); event.stopPropagation(); const originType = $('#swh-input-visit-type').val(); const originUrl = $('#swh-input-origin-url').val(); const extraData = {}; originSaveRequest(originType, originUrl, extraData, () => $('#swh-take-new-snapshot-request-status').html(newSnapshotRequestAcceptedAlert), () => $('#swh-take-new-snapshot-request-status').html(newSnapshotRequestPendingAlert), (statusCode, errorData) => { $('#swh-take-new-snapshot-request-status').css('color', 'red'); if (statusCode === 403) { const errorAlert = htmlAlert('danger', `Error: ${errorData['detail']}`, true); $('#swh-take-new-snapshot-request-status').html(errorAlert); } else if (statusCode === 429) { $('#swh-take-new-snapshot-request-status').html(newSnapshotRequestRateLimitAlert); } else { $('#swh-take-new-snapshot-request-status').html(newSnapshotRequestUnknownErrorAlert); } }); }); }); } export function formatValuePerType(type, value) { // Given some typed value, format and return accordingly formatted value const mapFormatPerTypeFn = { 'json': (v) => JSON.stringify(v, null, 2), 'date': (v) => new Date(v).toLocaleString(), 'raw': (v) => v, 'duration': (v) => v + ' seconds' }; return value === null ? null : mapFormatPerTypeFn[type](value); } export async function displaySaveRequestInfo(event, saveRequestId) { event.stopPropagation(); const saveRequestTaskInfoUrl = Urls.origin_save_task_info(saveRequestId); // close popover when clicking again on the info icon if ($(event.target).data('bs.popover')) { $(event.target).popover('dispose'); return; } $('.swh-save-request-info').popover('dispose'); $(event.target).popover({ animation: false, boundary: 'viewport', container: 'body', title: 'Save request task information ' + '`, content: `

Fetching task information...

`, html: true, placement: 'left', sanitizeFn: swh.webapp.filterXSS }); $(event.target).on('shown.bs.popover', function() { const popoverId = $(this).attr('aria-describedby'); $(`#${popoverId} .mdi-close`).click(() => { $(this).popover('dispose'); }); }); $(event.target).popover('show'); const response = await fetch(saveRequestTaskInfoUrl); const saveRequestTaskInfo = await response.json(); let content; if ($.isEmptyObject(saveRequestTaskInfo)) { content = 'Not available'; } else { const saveRequestInfo = []; const taskData = { 'Type': ['raw', 'type'], 'Visit status': ['raw', 'visit_status'], 'Arguments': ['json', 'arguments'], 'Id': ['raw', 'id'], 'Backend id': ['raw', 'backend_id'], 'Scheduling date': ['date', 'scheduled'], 'Start date': ['date', 'started'], 'Completion date': ['date', 'ended'], 'Duration': ['duration', 'duration'], 'Runner': ['raw', 'worker'], 'Log': ['raw', 'message'] }; for (const [title, [type, property]] of Object.entries(taskData)) { if (saveRequestTaskInfo.hasOwnProperty(property)) { saveRequestInfo.push({ key: title, value: formatValuePerType(type, saveRequestTaskInfo[property]) }); } } content = ''; for (const info of saveRequestInfo) { content += ``; } content += '
'; } $('.swh-popover').html(content); $(event.target).popover('update'); } export function fillSaveRequestFormAndScroll(visitType, originUrl) { $('#swh-input-origin-url').val(originUrl); let originTypeFound = false; $('#swh-input-visit-type option').each(function() { const val = $(this).val(); if (val && originUrl.includes(val)) { $(this).prop('selected', true); originTypeFound = true; } }); if (!originTypeFound) { $('#swh-input-visit-type option').each(function() { const val = $(this).val(); if (val === visitType) { $(this).prop('selected', true); } }); } window.scrollTo(0, 0); } diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py index 6d69f4a0..66102d63 100644 --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,901 +1,911 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from bisect import bisect_right from datetime import datetime, timedelta, timezone +from functools import lru_cache from itertools import product import json import logging from typing import Any, Dict, List, Optional, Tuple from prometheus_client import Gauge import requests import sentry_sdk from django.core.exceptions import ObjectDoesNotExist, ValidationError from django.core.validators import URLValidator from django.db.models import Q, QuerySet from django.utils.html import escape from swh.scheduler.utils import create_oneshot_task_dict -from swh.web import config from swh.web.common import archive from swh.web.common.exc import BadInputExc, ForbiddenExc, NotFoundExc from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.common.origin_visits import get_origin_visits from swh.web.common.typing import ( OriginExistenceCheckInfo, OriginInfo, SaveOriginRequestInfo, ) from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc - -scheduler = config.scheduler() +from swh.web.config import get_config, scheduler logger = logging.getLogger(__name__) # Number of days in the past to lookup for information MAX_THRESHOLD_DAYS = 30 # Non terminal visit statuses which needs updates NON_TERMINAL_STATUSES = [ VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, ] def get_origin_save_authorized_urls() -> List[str]: """ Get the list of origin url prefixes authorized to be immediately loaded into the archive (whitelist). Returns: list: The list of authorized origin url prefix """ return [origin.url for origin in SaveAuthorizedOrigin.objects.all()] def get_origin_save_unauthorized_urls() -> List[str]: """ Get the list of origin url prefixes forbidden to be loaded into the archive (blacklist). Returns: list: the list of unauthorized origin url prefix """ return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()] def can_save_origin(origin_url: str, bypass_pending_review: bool = False) -> str: """ Check if a software origin can be saved into the archive. 
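A note on the core change in this hunk: the module-level `scheduler = config.scheduler()` singleton is replaced by calls to a `scheduler()` accessor imported from `swh.web.config`, so the connection is established on first use rather than at import time (which is what lets the tests later in this diff swap in a real `swh_scheduler` fixture). A minimal sketch of such a lazily-cached accessor; `make_connection` is a hypothetical stand-in for the real factory:

```python
from functools import lru_cache


def make_connection() -> object:
    # hypothetical stand-in for swh.scheduler.get_scheduler(**config)
    return object()


@lru_cache()
def scheduler() -> object:
    # created once, on first call, rather than at module import time
    return make_connection()


assert scheduler() is scheduler()
```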
Based on the origin url, the save request will be either: * immediately accepted if the url is whitelisted * rejected if the url is blacklisted * put in pending state for manual review otherwise Args: origin_url (str): the software origin url to check Returns: str: the origin save request status, either **accepted**, **rejected** or **pending** """ # origin url may be blacklisted for url_prefix in get_origin_save_unauthorized_urls(): if origin_url.startswith(url_prefix): return SAVE_REQUEST_REJECTED # if the origin url is in the white list, it can be immediately saved for url_prefix in get_origin_save_authorized_urls(): if origin_url.startswith(url_prefix): return SAVE_REQUEST_ACCEPTED # otherwise, the origin url needs to be manually verified if the user # that submitted it does not have special permission if bypass_pending_review: # mark the origin URL as trusted in that case SaveAuthorizedOrigin.objects.get_or_create(url=origin_url) return SAVE_REQUEST_ACCEPTED else: return SAVE_REQUEST_PENDING # map visit type to scheduler task # TODO: do not hardcode the task name here (T1157) _visit_type_task = {"git": "load-git", "hg": "load-hg", "svn": "load-svn"} _visit_type_task_privileged = { "archives": "load-archive-files", } # map scheduler task status to origin save status _save_task_status = { "next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED, "next_run_scheduled": SAVE_TASK_SCHEDULED, "completed": SAVE_TASK_SUCCEEDED, "disabled": SAVE_TASK_FAILED, } # map scheduler task_run status to origin save status _save_task_run_status = { "scheduled": SAVE_TASK_SCHEDULED, "started": SAVE_TASK_RUNNING, "eventful": SAVE_TASK_SUCCEEDED, "uneventful": SAVE_TASK_SUCCEEDED, "failed": SAVE_TASK_FAILED, "permfailed": SAVE_TASK_FAILED, "lost": SAVE_TASK_FAILED, } +@lru_cache() +def get_scheduler_load_task_types() -> List[str]: + task_types = scheduler().get_task_types() + return [t["type"] for t in task_types if t["type"].startswith("load")] + + def get_savable_visit_types_dict(privileged_user: bool = False) -> Dict: """Returned the supported task types the user has access to. Args: privileged_user: Flag to determine if all visit types should be returned or not. Default to False to only list unprivileged visit types. Returns: the dict of supported visit types for the user """ if privileged_user: task_types = {**_visit_type_task, **_visit_type_task_privileged} else: task_types = _visit_type_task - return task_types + # scheduler is not available when running cypress tests + if get_config().get("e2e_tests_mode"): + return task_types + else: + load_task_types = get_scheduler_load_task_types() + return {k: v for k, v in task_types.items() if v in load_task_types} def get_savable_visit_types(privileged_user: bool = False) -> List[str]: """Return the list of visit types the user can perform save requests on. Args: privileged_user: Flag to determine if all visit types should be returned or not. Default to False to only list unprivileged visit types. Returns: the list of saveable visit types """ return sorted(list(get_savable_visit_types_dict(privileged_user).keys())) def _check_visit_type_savable(visit_type: str, privileged_user: bool = False) -> None: visit_type_tasks = get_savable_visit_types(privileged_user) if visit_type not in visit_type_tasks: allowed_visit_types = ", ".join(visit_type_tasks) raise BadInputExc( f"Visit of type {visit_type} can not be saved! 
" f"Allowed types are the following: {allowed_visit_types}" ) _validate_url = URLValidator(schemes=["http", "https", "svn", "git"]) def _check_origin_url_valid(origin_url: str) -> None: try: _validate_url(origin_url) except ValidationError: raise BadInputExc( "The provided origin url (%s) is not valid!" % escape(origin_url) ) def origin_exists(origin_url: str) -> OriginExistenceCheckInfo: """Check the origin url for existence. If it exists, extract some more useful information on the origin. """ resp = requests.head(origin_url, allow_redirects=True) exists = resp.ok content_length: Optional[int] = None last_modified: Optional[str] = None if exists: # Also process X-Archive-Orig-* headers in case the URL targets the # Internet Archive. size_ = resp.headers.get( "Content-Length", resp.headers.get("X-Archive-Orig-Content-Length") ) content_length = int(size_) if size_ else None try: date_str = resp.headers.get( "Last-Modified", resp.headers.get("X-Archive-Orig-Last-Modified", "") ) date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z") last_modified = date.isoformat() except ValueError: # if not provided or not parsable as per the expected format, keep it None pass return OriginExistenceCheckInfo( origin_url=origin_url, exists=exists, last_modified=last_modified, content_length=content_length, ) def _check_origin_exists(url: str) -> OriginExistenceCheckInfo: """Ensure an URL exists, if not raise an explicit message.""" metadata = origin_exists(url) if not metadata["exists"]: raise BadInputExc(f"The provided url ({escape(url)}) does not exist!") return metadata def _get_visit_info_for_save_request( save_request: SaveOriginRequest, ) -> Tuple[Optional[datetime], Optional[str]]: """Retrieve visit information out of a save request Args: save_request: Input save origin request to retrieve information for. Returns: Tuple of (visit date, optional visit status) for such save request origin """ visit_date = None visit_status = None time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # stop trying to find a visit date one month after save request submission # as those requests to storage are expensive and associated loading task # surely ended up with errors if time_delta.days <= MAX_THRESHOLD_DAYS: try: origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url)) origin_visits = get_origin_visits(origin_info) visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits] i = bisect_right(visit_dates, save_request.request_date) if i != len(visit_dates): visit_date = visit_dates[i] visit_status = origin_visits[i]["status"] except Exception as exc: sentry_sdk.capture_exception(exc) return visit_date, visit_status def _check_visit_update_status( save_request: SaveOriginRequest, ) -> Tuple[Optional[datetime], Optional[str], Optional[str]]: """Given a save request, determine whether a save request was successful or failed. Args: save_request: Input save origin request to retrieve information for. 
Returns: Tuple of (optional visit date, optional visit status, optional save task status) for such save request origin """ visit_date, visit_status = _get_visit_info_for_save_request(save_request) loading_task_status = None if visit_date and visit_status in ("full", "partial"): # visit has been performed, mark the saving task as succeeded loading_task_status = SAVE_TASK_SUCCEEDED elif visit_status in ("created", "ongoing"): # visit is currently running loading_task_status = SAVE_TASK_RUNNING elif visit_status in ("not_found", "failed"): loading_task_status = SAVE_TASK_FAILED else: time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # consider the task as failed if it is still in scheduled state # 30 days after its submission if time_delta.days > MAX_THRESHOLD_DAYS: loading_task_status = SAVE_TASK_FAILED return visit_date, visit_status, loading_task_status def _compute_task_loading_status( task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, ) -> Optional[str]: loading_task_status: Optional[str] = None # First determine the loading task status out of task information if task: loading_task_status = _save_task_status[task["status"]] if task_run: loading_task_status = _save_task_run_status[task_run["status"]] return loading_task_status def _update_save_request_info( save_request: SaveOriginRequest, task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, ) -> SaveOriginRequestInfo: """Update save request information out of the visit status and fallback to the task and task_run information if the visit status is missing. Args: save_request: Save request task: Associated scheduler task information about the save request task_run: Most recent run occurrence of the associated task Returns: Summary of the save request information updated. """ must_save = False # To determine the save code now request's final status, the visit date must be set # and the visit status must be a final one. Once they do, the save code now is # definitely done. if ( not save_request.visit_date or not save_request.visit_status or save_request.visit_status in NON_TERMINAL_STATUSES ): visit_date, visit_status, loading_task_status = _check_visit_update_status( save_request ) if not loading_task_status: # fallback when not provided loading_task_status = _compute_task_loading_status(task, task_run) if visit_date != save_request.visit_date: must_save = True save_request.visit_date = visit_date if visit_status != save_request.visit_status: must_save = True save_request.visit_status = visit_status if ( loading_task_status is not None and loading_task_status != save_request.loading_task_status ): must_save = True save_request.loading_task_status = loading_task_status if must_save: save_request.save() return save_request.to_dict() def create_save_origin_request( visit_type: str, origin_url: str, privileged_user: bool = False, user_id: Optional[int] = None, **kwargs, ) -> SaveOriginRequestInfo: """Create a loading task to save a software origin into the archive. This function aims to create a software origin loading task trough the use of the swh-scheduler component. First, some checks are performed to see if the visit type and origin url are valid but also if the the save request can be accepted. For the 'archives' visit type, this also ensures the artifacts actually exists. If those checks passed, the loading task is then created. Otherwise, the save request is put in pending or rejected state. 
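The branching in `_check_visit_update_status` above reduces to a small mapping from visit status to save task status, plus a 30-day timeout. A sketch of that decision as a pure function, with plain strings standing in for the `SAVE_TASK_*` constants (note the real code additionally requires a visit date before concluding success):

```python
from datetime import datetime, timedelta, timezone
from typing import Optional

MAX_THRESHOLD_DAYS = 30


def loading_task_status(
    visit_status: Optional[str], request_date: datetime
) -> Optional[str]:
    if visit_status in ("full", "partial"):
        return "succeeded"  # visit was performed
    if visit_status in ("created", "ongoing"):
        return "running"  # visit is in progress
    if visit_status in ("not_found", "failed"):
        return "failed"
    # no conclusive visit status: consider the task failed once the
    # request is more than 30 days old, otherwise leave it undecided
    age = datetime.now(tz=timezone.utc) - request_date
    return "failed" if age.days > MAX_THRESHOLD_DAYS else None


stale = datetime.now(tz=timezone.utc) - timedelta(days=MAX_THRESHOLD_DAYS + 1)
assert loading_task_status(None, stale) == "failed"
assert loading_task_status("full", stale) == "succeeded"
```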
All the submitted save requests are logged into the swh-web database to keep track of them. Args: visit_type: the type of visit to perform (e.g. git, hg, svn, archives, ...) origin_url: the url of the origin to save privileged: Whether the user has some more privilege than other (bypass review, access to privileged other visit types) user_id: User identifier (provided when authenticated) kwargs: Optional parameters (e.g. artifact_url, artifact_filename, artifact_version) Raises: BadInputExc: the visit type or origin url is invalid or inexistent ForbiddenExc: the provided origin url is blacklisted Returns: dict: A dict describing the save request with the following keys: * **visit_type**: the type of visit to perform * **origin_url**: the url of the origin * **save_request_date**: the date the request was submitted * **save_request_status**: the request status, either **accepted**, **rejected** or **pending** * **save_task_status**: the origin loading task status, either **not created**, **not yet scheduled**, **scheduled**, **succeed** or **failed** """ visit_type_tasks = get_savable_visit_types_dict(privileged_user) _check_visit_type_savable(visit_type, privileged_user) _check_origin_url_valid(origin_url) # if all checks passed so far, we can try and save the origin save_request_status = can_save_origin(origin_url, privileged_user) task = None # if the origin save request is accepted, create a scheduler # task to load it into the archive if save_request_status == SAVE_REQUEST_ACCEPTED: # create a task with high priority task_kwargs: Dict[str, Any] = { "priority": "high", "url": origin_url, } if visit_type == "archives": # extra arguments for that type are required archives_data = kwargs.get("archives_data", []) if not archives_data: raise BadInputExc( "Artifacts data are missing for the archives visit type." 
) artifacts = [] for artifact in archives_data: artifact_url = artifact.get("artifact_url") artifact_version = artifact.get("artifact_version") if not artifact_url or not artifact_version: raise BadInputExc("Missing url or version for an artifact to load.") metadata = _check_origin_exists(artifact_url) artifacts.append( { "url": artifact_url, "version": artifact_version, "time": metadata["last_modified"], "length": metadata["content_length"], } ) task_kwargs = dict(**task_kwargs, artifacts=artifacts, snapshot_append=True) sor = None # get list of previously submitted save requests (most recent first) current_sors = list( SaveOriginRequest.objects.filter( visit_type=visit_type, origin_url=origin_url ).order_by("-request_date") ) can_create_task = False # if no save requests previously submitted, create the scheduler task if not current_sors: can_create_task = True else: # get the latest submitted save request sor = current_sors[0] # if it was in pending state, we need to create the scheduler task # and update the save request info in the database if sor.status == SAVE_REQUEST_PENDING: can_create_task = True # a task has already been created to load the origin elif sor.loading_task_id != -1: # get the scheduler task and its status - tasks = scheduler.get_tasks([sor.loading_task_id]) + tasks = scheduler().get_tasks([sor.loading_task_id]) task = tasks[0] if tasks else None - task_runs = scheduler.get_task_runs([sor.loading_task_id]) + task_runs = scheduler().get_task_runs([sor.loading_task_id]) task_run = task_runs[0] if task_runs else None save_request_info = _update_save_request_info(sor, task, task_run) task_status = save_request_info["save_task_status"] # create a new scheduler task only if the previous one has been # already executed if ( task_status == SAVE_TASK_FAILED or task_status == SAVE_TASK_SUCCEEDED ): can_create_task = True sor = None else: can_create_task = False if can_create_task: # effectively create the scheduler task task_dict = create_oneshot_task_dict( visit_type_tasks[visit_type], **task_kwargs ) - task = scheduler.create_tasks([task_dict])[0] + task = scheduler().create_tasks([task_dict])[0] # pending save request has been accepted if sor: sor.status = SAVE_REQUEST_ACCEPTED sor.loading_task_id = task["id"] sor.save() else: sor = SaveOriginRequest.objects.create( visit_type=visit_type, origin_url=origin_url, status=save_request_status, loading_task_id=task["id"], user_ids=f'"{user_id}"' if user_id else None, ) # save request must be manually reviewed for acceptation elif save_request_status == SAVE_REQUEST_PENDING: # check if there is already such a save request already submitted, # no need to add it to the database in that case try: sor = SaveOriginRequest.objects.get( visit_type=visit_type, origin_url=origin_url, status=save_request_status ) user_ids = sor.user_ids if sor.user_ids is not None else "" if user_id is not None and f'"{user_id}"' not in user_ids: # update user ids list sor.user_ids = f'{sor.user_ids},"{user_id}"' sor.save() # if not add it to the database except ObjectDoesNotExist: sor = SaveOriginRequest.objects.create( visit_type=visit_type, origin_url=origin_url, status=save_request_status, user_ids=f'"{user_id}"' if user_id else None, ) # origin can not be saved as its url is blacklisted, # log the request to the database anyway else: sor = SaveOriginRequest.objects.create( visit_type=visit_type, origin_url=origin_url, status=save_request_status, user_ids=f'"{user_id}"' if user_id else None, ) if save_request_status == SAVE_REQUEST_REJECTED: raise 
ForbiddenExc( ( 'The "save code now" request has been rejected ' "because the provided origin url is blacklisted." ) ) assert sor is not None return _update_save_request_info(sor, task) def update_save_origin_requests_from_queryset( requests_queryset: QuerySet, ) -> List[SaveOriginRequestInfo]: """Update all save requests from a SaveOriginRequest queryset, update their status in db and return the list of impacted save_requests. Args: requests_queryset: input SaveOriginRequest queryset Returns: list: A list of save origin request info dicts as described in :func:`swh.web.common.origin_save.create_save_origin_request` """ task_ids = [] for sor in requests_queryset: task_ids.append(sor.loading_task_id) save_requests = [] if task_ids: - tasks = scheduler.get_tasks(task_ids) + tasks = scheduler().get_tasks(task_ids) tasks = {task["id"]: task for task in tasks} - task_runs = scheduler.get_task_runs(tasks) + task_runs = scheduler().get_task_runs(tasks) task_runs = {task_run["task"]: task_run for task_run in task_runs} for sor in requests_queryset: sr_dict = _update_save_request_info( sor, tasks.get(sor.loading_task_id), task_runs.get(sor.loading_task_id), ) save_requests.append(sr_dict) return save_requests def refresh_save_origin_request_statuses() -> List[SaveOriginRequestInfo]: """Refresh non-terminal save origin requests (SOR) in the backend. Non-terminal SOR are requests whose status is **accepted** and their task status are either **created**, **not yet scheduled**, **scheduled** or **running**. This shall compute this list of SOR, checks their status in the scheduler and optionally elasticsearch for their current status. Then update those in db. Finally, this returns the refreshed information on those SOR. """ pivot_date = datetime.now(tz=timezone.utc) - timedelta(days=MAX_THRESHOLD_DAYS) save_requests = SaveOriginRequest.objects.filter( # Retrieve accepted request statuses (all statuses) Q(status=SAVE_REQUEST_ACCEPTED), # those without the required information we need to update Q(visit_date__isnull=True) | Q(visit_status__isnull=True) | Q(visit_status__in=NON_TERMINAL_STATUSES), # limit results to recent ones (that is roughly 30 days old at best) Q(request_date__gte=pivot_date), ) return ( update_save_origin_requests_from_queryset(save_requests) if save_requests.count() > 0 else [] ) def get_save_origin_requests( visit_type: str, origin_url: str ) -> List[SaveOriginRequestInfo]: """ Get all save requests for a given software origin. Args: visit_type: the type of visit origin_url: the url of the origin Raises: BadInputExc: the visit type or origin url is invalid swh.web.common.exc.NotFoundExc: no save requests can be found for the given origin Returns: list: A list of save origin requests dict as described in :func:`swh.web.common.origin_save.create_save_origin_request` """ _check_visit_type_savable(visit_type) _check_origin_url_valid(origin_url) sors = SaveOriginRequest.objects.filter( visit_type=visit_type, origin_url=origin_url ) if sors.count() == 0: raise NotFoundExc( f"No save requests found for visit of type {visit_type} " f"on origin with url {origin_url}." ) return update_save_origin_requests_from_queryset(sors) def get_save_origin_task_info( save_request_id: int, full_info: bool = True ) -> Dict[str, Any]: """ Get detailed information about an accepted save origin request and its associated loading task. If the associated loading task info is archived and removed from the scheduler database, returns an empty dictionary. 
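`update_save_origin_requests_from_queryset` above fetches all tasks and task runs in two bulk scheduler calls and indexes them by id, instead of issuing one pair of calls per save request. A sketch of that batching pattern, with plain dicts standing in for scheduler results; names are illustrative:

```python
from typing import Any, Dict, List, Optional, Tuple

SchedulerDict = Dict[str, Any]


def pair_tasks_and_runs(
    task_ids: List[int],
    tasks: List[SchedulerDict],
    task_runs: List[SchedulerDict],
) -> List[Tuple[Optional[SchedulerDict], Optional[SchedulerDict]]]:
    # two bulk calls worth of results, indexed once for O(1) lookups
    tasks_by_id = {task["id"]: task for task in tasks}
    runs_by_task_id = {run["task"]: run for run in task_runs}
    return [(tasks_by_id.get(i), runs_by_task_id.get(i)) for i in task_ids]


pairs = pair_tasks_and_runs(
    [1, 2],
    tasks=[{"id": 1, "status": "completed"}],
    task_runs=[{"task": 1, "status": "eventful"}],
)
assert pairs[0][1]["status"] == "eventful"
assert pairs[1] == (None, None)
```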
Args: save_request_id: identifier of a save origin request full_info: whether to return detailed info for staff users Returns: A dictionary with the following keys: - **type**: loading task type - **arguments**: loading task arguments - **id**: loading task database identifier - **backend_id**: loading task celery identifier - **scheduled**: loading task scheduling date - **ended**: loading task termination date - **status**: loading task execution status - **visit_status**: Actual visit status Depending on the availability of the task logs in the elasticsearch cluster of Software Heritage, the returned dictionary may also contain the following keys: - **name**: associated celery task name - **message**: relevant log message from task execution - **duration**: task execution time (only if it succeeded) - **worker**: name of the worker that executed the task """ try: save_request = SaveOriginRequest.objects.get(id=save_request_id) except ObjectDoesNotExist: return {} - task = scheduler.get_tasks([save_request.loading_task_id]) + task = scheduler().get_tasks([save_request.loading_task_id]) task = task[0] if task else None if task is None: return {} - task_run = scheduler.get_task_runs([task["id"]]) + task_run = scheduler().get_task_runs([task["id"]]) task_run = task_run[0] if task_run else None if task_run is None: return {} task_run["type"] = task["type"] task_run["arguments"] = task["arguments"] task_run["id"] = task_run["task"] del task_run["task"] del task_run["metadata"] # Enrich the task run with the loading visit status task_run["visit_status"] = save_request.visit_status - es_workers_index_url = config.get_config()["es_workers_index_url"] + es_workers_index_url = get_config()["es_workers_index_url"] if not es_workers_index_url: return task_run es_workers_index_url += "/_search" if save_request.visit_date: min_ts = save_request.visit_date max_ts = min_ts + timedelta(days=7) else: min_ts = save_request.request_date max_ts = min_ts + timedelta(days=MAX_THRESHOLD_DAYS) min_ts_unix = int(min_ts.timestamp()) * 1000 max_ts_unix = int(max_ts.timestamp()) * 1000 save_task_status = _save_task_status[task["status"]] priority = "3" if save_task_status == SAVE_TASK_FAILED else "6" query = { "bool": { "must": [ {"match_phrase": {"priority": {"query": priority}}}, {"match_phrase": {"swh_task_id": {"query": task_run["backend_id"]}}}, { "range": { "@timestamp": { "gte": min_ts_unix, "lte": max_ts_unix, "format": "epoch_millis", } } }, ] } } try: response = requests.post( es_workers_index_url, json={"query": query, "sort": ["@timestamp"]}, timeout=30, ) results = json.loads(response.text) if results["hits"]["total"]["value"] >= 1: task_run_info = results["hits"]["hits"][-1]["_source"] if "swh_logging_args_runtime" in task_run_info: duration = task_run_info["swh_logging_args_runtime"] task_run["duration"] = duration if "message" in task_run_info: task_run["message"] = task_run_info["message"] if "swh_logging_args_name" in task_run_info: task_run["name"] = task_run_info["swh_logging_args_name"] elif "swh_task_name" in task_run_info: task_run["name"] = task_run_info["swh_task_name"] if "hostname" in task_run_info: task_run["worker"] = task_run_info["hostname"] elif "host" in task_run_info: task_run["worker"] = task_run_info["host"] except Exception as exc: logger.warning("Request to Elasticsearch failed\n%s", exc) sentry_sdk.capture_exception(exc) if not full_info: for field in ("id", "backend_id", "worker"): # remove some staff only fields task_run.pop(field, None) if "message" in task_run and "Loading 
failure" in task_run["message"]: # hide traceback for non staff users, only display exception message_lines = task_run["message"].split("\n") message = "" for line in message_lines: if line.startswith("Traceback"): break message += f"{line}\n" message += message_lines[-1] task_run["message"] = message return task_run SUBMITTED_SAVE_REQUESTS_METRIC = "swh_web_submitted_save_requests" _submitted_save_requests_gauge = Gauge( name=SUBMITTED_SAVE_REQUESTS_METRIC, documentation="Number of submitted origin save requests", labelnames=["status", "visit_type"], registry=SWH_WEB_METRICS_REGISTRY, ) ACCEPTED_SAVE_REQUESTS_METRIC = "swh_web_accepted_save_requests" _accepted_save_requests_gauge = Gauge( name=ACCEPTED_SAVE_REQUESTS_METRIC, documentation="Number of accepted origin save requests", labelnames=["load_task_status", "visit_type"], registry=SWH_WEB_METRICS_REGISTRY, ) # Metric on the delay of save code now request per status and visit_type. This is the # time difference between the save code now is requested and the time it got ingested. ACCEPTED_SAVE_REQUESTS_DELAY_METRIC = "swh_web_save_requests_delay_seconds" _accepted_save_requests_delay_gauge = Gauge( name=ACCEPTED_SAVE_REQUESTS_DELAY_METRIC, documentation="Save Requests Duration", labelnames=["load_task_status", "visit_type"], registry=SWH_WEB_METRICS_REGISTRY, ) def compute_save_requests_metrics() -> None: """Compute Prometheus metrics related to origin save requests: - Number of submitted origin save requests - Number of accepted origin save requests - Save Code Now requests delay between request time and actual time of ingestion """ request_statuses = ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING, ) load_task_statuses = ( SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING, ) # for metrics, we want access to all visit types visit_types = get_savable_visit_types(privileged_user=True) labels_set = product(request_statuses, visit_types) for labels in labels_set: _submitted_save_requests_gauge.labels(*labels).set(0) labels_set = product(load_task_statuses, visit_types) for labels in labels_set: _accepted_save_requests_gauge.labels(*labels).set(0) duration_load_task_statuses = ( SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED, ) for labels in product(duration_load_task_statuses, visit_types): _accepted_save_requests_delay_gauge.labels(*labels).set(0) for sor in SaveOriginRequest.objects.all(): if sor.status == SAVE_REQUEST_ACCEPTED: _accepted_save_requests_gauge.labels( load_task_status=sor.loading_task_status, visit_type=sor.visit_type, ).inc() _submitted_save_requests_gauge.labels( status=sor.status, visit_type=sor.visit_type ).inc() if ( sor.loading_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED) and sor.visit_date is not None and sor.request_date is not None ): delay = sor.visit_date.timestamp() - sor.request_date.timestamp() _accepted_save_requests_delay_gauge.labels( load_task_status=sor.loading_task_status, visit_type=sor.visit_type, ).inc(delay) diff --git a/swh/web/misc/origin_save.py b/swh/web/misc/origin_save.py index da7913d6..e15261b8 100644 --- a/swh/web/misc/origin_save.py +++ b/swh/web/misc/origin_save.py @@ -1,103 +1,97 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.conf.urls import url from 
django.core.paginator import Paginator from django.db.models import Q from django.http import JsonResponse from django.shortcuts import render from swh.web.auth.utils import privileged_user from swh.web.common.models import SaveOriginRequest from swh.web.common.origin_save import ( get_savable_visit_types, get_save_origin_task_info, ) def _origin_save_view(request): return render( request, "misc/origin-save.html", - {"heading": ("Request the saving of a software origin into the archive")}, + { + "heading": ("Request the saving of a software origin into the archive"), + "visit_types": get_savable_visit_types(privileged_user(request)), + }, ) -def _visit_save_types_list(request) -> JsonResponse: - """Return the list of supported visit types as json response - - """ - visit_types = get_savable_visit_types(privileged_user(request)) - return JsonResponse(visit_types, safe=False) - - def _origin_save_requests_list(request, status): if status != "all": save_requests = SaveOriginRequest.objects.filter(status=status) else: save_requests = SaveOriginRequest.objects.all() table_data = {} table_data["recordsTotal"] = save_requests.count() table_data["draw"] = int(request.GET["draw"]) search_value = request.GET["search[value]"] column_order = request.GET["order[0][column]"] field_order = request.GET["columns[%s][name]" % column_order] order_dir = request.GET["order[0][dir]"] if order_dir == "desc": field_order = "-" + field_order save_requests = save_requests.order_by(field_order) length = int(request.GET["length"]) page = int(request.GET["start"]) / length + 1 if search_value: save_requests = save_requests.filter( Q(status__icontains=search_value) | Q(loading_task_status__icontains=search_value) | Q(visit_type__icontains=search_value) | Q(origin_url__icontains=search_value) ) if ( int(request.GET.get("user_requests_only", "0")) and request.user.is_authenticated ): save_requests = save_requests.filter(user_ids__contains=f'"{request.user.id}"') table_data["recordsFiltered"] = save_requests.count() paginator = Paginator(save_requests, length) table_data["data"] = [sor.to_dict() for sor in paginator.page(page).object_list] return JsonResponse(table_data) def _save_origin_task_info(request, save_request_id): request_info = get_save_origin_task_info( save_request_id, full_info=request.user.is_staff ) for date_field in ("scheduled", "started", "ended"): if date_field in request_info and request_info[date_field] is not None: request_info[date_field] = request_info[date_field].isoformat() return JsonResponse(request_info) urlpatterns = [ url(r"^save/$", _origin_save_view, name="origin-save"), - url(r"^save/types/list/$", _visit_save_types_list, name="origin-save-types-list"), url( r"^save/requests/list/(?P.+)/$", _origin_save_requests_list, name="origin-save-requests-list", ), url( r"^save/task/info/(?P.+)/", _save_origin_task_info, name="origin-save-task-info", ), ] diff --git a/swh/web/templates/misc/origin-save.html b/swh/web/templates/misc/origin-save.html index 28b75679..60fa8d72 100644 --- a/swh/web/templates/misc/origin-save.html +++ b/swh/web/templates/misc/origin-save.html @@ -1,139 +1,141 @@ {% extends "../layout.html" %} {% comment %} Copyright (C) 2018-2021 The Software Heritage developers See the AUTHORS file at the top-level directory of this distribution License: GNU Affero General Public License version 3, or any later version See top-level LICENSE file for more information {% endcomment %} {% load render_bundle from webpack_loader %} {% load static %} {% block title %}{{ heading }} – Software 
Heritage archive{% endblock %} {% block header %} {% render_bundle 'save' %} {% endblock %} {% block navbar-content %}

Save code now

{% endblock %} {% block content %}

You can help extend the content of the Software Heritage archive by submitting an origin save request. To do so, fill in the required information in the form below:

{% csrf_token %}
The origin type must be specified
The origin url is not valid or does not reference a code repository

A "Save code now" request takes the following parameters:

  • Origin type: the type of version control system the software origin is using. Currently, the supported types are:
  • Origin url: the url of the remote repository for the software origin.
    To avoid loading errors when Software Heritage visits the origin, you should provide the clone/checkout url exactly as given by the provider hosting the software origin.
    It can easily be found in the web interface used to browse the software origin.
    For instance, if you want to save a git origin into the archive, you should check that the command $ git clone <origin_url>
    does not return an error before submitting a request (the server-side url check is sketched just below).
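Server-side, the same sanity check is enforced with Django's `URLValidator` restricted to the schemes supported for loading (this mirrors `_validate_url` in `swh/web/common/origin_save.py` from the diff above); a minimal sketch, assuming Django is installed:

```python
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator

_validate_url = URLValidator(schemes=["http", "https", "svn", "git"])


def origin_url_is_valid(origin_url: str) -> bool:
    # True only for well-formed urls using an allowed scheme
    try:
        _validate_url(origin_url)
        return True
    except ValidationError:
        return False


assert origin_url_is_valid("https://github.com/torvalds/linux")
assert not origin_url_is_valid("ftp://example.org/repo.git")
```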

Once submitted, your save request can either be:

  • accepted: a visit to the provided origin will then be scheduled by Software Heritage to load its content into the archive as soon as possible
  • rejected: the provided origin url is blacklisted and no visit will be scheduled
  • put in pending state: a manual review will then be performed to determine whether the origin can be safely loaded into the archive (the decision logic is sketched below)
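The three outcomes listed above follow the prefix matching performed by `can_save_origin` in the backend: the blacklist is checked first, then the whitelist, and anything else goes to manual review. A simplified sketch with in-memory lists standing in for the `SaveUnauthorizedOrigin`/`SaveAuthorizedOrigin` tables:

```python
from typing import List


def can_save_origin(
    origin_url: str, authorized: List[str], unauthorized: List[str]
) -> str:
    # blacklist wins over whitelist; everything else needs a human review
    if any(origin_url.startswith(prefix) for prefix in unauthorized):
        return "rejected"
    if any(origin_url.startswith(prefix) for prefix in authorized):
        return "accepted"
    return "pending"


whitelist = ["https://github.com/"]
blacklist = ["https://github.com/user/illegal_repo"]
assert can_save_origin("https://github.com/torvalds/linux", whitelist, blacklist) == "accepted"
assert can_save_origin("https://example.org/repo", whitelist, blacklist) == "pending"
```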

Once a save request has been accepted, you can follow its current status in the submitted save requests list.
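Behind that status tracking, `refresh_save_origin_request_statuses` (in the backend diff above) only reconsiders accepted requests that still lack a terminal visit status and are at most 30 days old. A sketch of the selection criteria; `SaveOriginRequest` and the status constants come from `swh.web.common.models`:

```python
from datetime import datetime, timedelta, timezone

from django.db.models import Q, QuerySet

from swh.web.common.models import (
    SAVE_REQUEST_ACCEPTED,
    VISIT_STATUS_CREATED,
    VISIT_STATUS_ONGOING,
    SaveOriginRequest,
)

MAX_THRESHOLD_DAYS = 30


def non_terminal_save_requests() -> QuerySet:
    # accepted requests missing a terminal visit status, at most ~30 days old
    pivot = datetime.now(tz=timezone.utc) - timedelta(days=MAX_THRESHOLD_DAYS)
    return SaveOriginRequest.objects.filter(
        Q(status=SAVE_REQUEST_ACCEPTED),
        Q(visit_date__isnull=True)
        | Q(visit_status__isnull=True)
        | Q(visit_status__in=[VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING]),
        Q(request_date__gte=pivot),
    )
```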
If you submitted requests while authenticated, you can filter the list to display only your own requests.
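That per-user filter is implemented server-side in `_origin_save_requests_list` by substring-matching the quoted user id inside the request's `user_ids` field, which stores ids as comma-separated quoted values. A sketch of why the quotes matter:

```python
def is_own_request(user_ids: str, user_id: int) -> bool:
    # user_ids looks like '"1","42"'; the quotes prevent partial-id matches
    return f'"{user_id}"' in user_ids


assert is_own_request('"1","42"', 42)
assert not is_own_request('"142"', 42)  # 42 must not match inside 142
```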

Date Type Url Request Status Info
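Each of those column headers maps to a DataTables server-side parameter; `_origin_save_requests_list` translates `order[0][column]` and `order[0][dir]` into a Django `order_by` clause. A sketch of that translation, with a plain dict standing in for `request.GET`:

```python
from typing import Dict


def order_clause(params: Dict[str, str]) -> str:
    # DataTables sends the index of the ordered column plus a direction
    column = params["order[0][column]"]
    field = params[f"columns[{column}][name]"]
    if params["order[0][dir]"] == "desc":
        field = "-" + field  # Django's descending-order prefix
    return field


params = {
    "order[0][column]": "0",
    "columns[0][name]": "request_date",
    "order[0][dir]": "desc",
}
assert order_clause(params) == "-request_date"
```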

{% endblock %} diff --git a/swh/web/tests/admin/test_origin_save.py b/swh/web/tests/admin/test_origin_save.py index 3f53d264..e4e701cf 100644 --- a/swh/web/tests/admin/test_origin_save.py +++ b/swh/web/tests/admin/test_origin_save.py @@ -1,220 +1,192 @@ -# Copyright (C) 2015-2019 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from urllib.parse import unquote import pytest from django.contrib.auth import get_user_model from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_NOT_YET_SCHEDULED, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.common.origin_save import can_save_origin from swh.web.common.utils import reverse from swh.web.tests.utils import check_http_get_response, check_http_post_response _user_name = "swh-web-admin" _user_mail = "admin@swh-web.org" _user_password = "..34~pounds~BEAUTY~march~63.." _authorized_origin_url = "https://scm.ourproject.org/anonscm/" _unauthorized_origin_url = "https://www.softwareheritage.org/" pytestmark = pytest.mark.django_db @pytest.fixture(autouse=True) def populated_db(): User = get_user_model() user = User.objects.create_user(_user_name, _user_mail, _user_password) user.is_staff = True user.save() SaveAuthorizedOrigin.objects.create(url=_authorized_origin_url) SaveUnauthorizedOrigin.objects.create(url=_unauthorized_origin_url) def check_not_login(client, url): login_url = reverse("login", query_params={"next": url}) resp = check_http_post_response(client, url, status_code=302) assert unquote(resp.url) == login_url def test_add_authorized_origin_url(client): authorized_url = "https://scm.adullact.net/anonscm/" assert can_save_origin(authorized_url) == SAVE_REQUEST_PENDING url = reverse( "admin-origin-save-add-authorized-url", url_args={"origin_url": authorized_url} ) check_not_login(client, url) assert can_save_origin(authorized_url) == SAVE_REQUEST_PENDING client.login(username=_user_name, password=_user_password) check_http_post_response(client, url, status_code=200) assert can_save_origin(authorized_url) == SAVE_REQUEST_ACCEPTED def test_remove_authorized_origin_url(client): assert can_save_origin(_authorized_origin_url) == SAVE_REQUEST_ACCEPTED url = reverse( "admin-origin-save-remove-authorized-url", url_args={"origin_url": _authorized_origin_url}, ) check_not_login(client, url) assert can_save_origin(_authorized_origin_url) == SAVE_REQUEST_ACCEPTED client.login(username=_user_name, password=_user_password) check_http_post_response(client, url, status_code=200) assert can_save_origin(_authorized_origin_url) == SAVE_REQUEST_PENDING def test_add_unauthorized_origin_url(client): unauthorized_url = "https://www.yahoo./" assert can_save_origin(unauthorized_url) == SAVE_REQUEST_PENDING url = reverse( "admin-origin-save-add-unauthorized-url", url_args={"origin_url": unauthorized_url}, ) check_not_login(client, url) assert can_save_origin(unauthorized_url) == SAVE_REQUEST_PENDING client.login(username=_user_name, password=_user_password) check_http_post_response(client, url, status_code=200) assert can_save_origin(unauthorized_url) == SAVE_REQUEST_REJECTED def test_remove_unauthorized_origin_url(client): assert can_save_origin(_unauthorized_origin_url) == SAVE_REQUEST_REJECTED url = reverse( 
"admin-origin-save-remove-unauthorized-url", url_args={"origin_url": _unauthorized_origin_url}, ) check_not_login(client, url) assert can_save_origin(_unauthorized_origin_url) == SAVE_REQUEST_REJECTED client.login(username=_user_name, password=_user_password) check_http_post_response(client, url, status_code=200) assert can_save_origin(_unauthorized_origin_url) == SAVE_REQUEST_PENDING -def test_accept_pending_save_request(client, mocker): - mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") +def test_accept_pending_save_request(client, swh_scheduler): + visit_type = "git" origin_url = "https://v2.pikacode.com/bthate/botlib.git" save_request_url = reverse( "api-1-save-origin", url_args={"visit_type": visit_type, "origin_url": origin_url}, ) response = check_http_post_response(client, save_request_url, status_code=200) assert response.data["save_request_status"] == SAVE_REQUEST_PENDING accept_request_url = reverse( "admin-origin-save-request-accept", url_args={"visit_type": visit_type, "origin_url": origin_url}, ) check_not_login(client, accept_request_url) - tasks_data = [ - { - "priority": "high", - "policy": "oneshot", - "type": "load-git", - "arguments": {"kwargs": {"repo_url": origin_url}, "args": []}, - "status": "next_run_not_scheduled", - "id": 1, - } - ] - - mock_scheduler.create_tasks.return_value = tasks_data - mock_scheduler.get_tasks.return_value = tasks_data - client.login(username=_user_name, password=_user_password) response = check_http_post_response(client, accept_request_url, status_code=200) response = check_http_get_response(client, save_request_url, status_code=200) assert response.data[0]["save_request_status"] == SAVE_REQUEST_ACCEPTED assert response.data[0]["save_task_status"] == SAVE_TASK_NOT_YET_SCHEDULED -def test_reject_pending_save_request(client, mocker): - mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") +def test_reject_pending_save_request(client, swh_scheduler): + visit_type = "git" origin_url = "https://wikipedia.com" save_request_url = reverse( "api-1-save-origin", url_args={"visit_type": visit_type, "origin_url": origin_url}, ) response = check_http_post_response(client, save_request_url, status_code=200) assert response.data["save_request_status"] == SAVE_REQUEST_PENDING reject_request_url = reverse( "admin-origin-save-request-reject", url_args={"visit_type": visit_type, "origin_url": origin_url}, ) check_not_login(client, reject_request_url) client.login(username=_user_name, password=_user_password) response = check_http_post_response(client, reject_request_url, status_code=200) - tasks_data = [ - { - "priority": "high", - "policy": "oneshot", - "type": "load-git", - "arguments": {"kwargs": {"repo_url": origin_url}, "args": []}, - "status": "next_run_not_scheduled", - "id": 1, - } - ] - - mock_scheduler.create_tasks.return_value = tasks_data - mock_scheduler.get_tasks.return_value = tasks_data - response = check_http_get_response(client, save_request_url, status_code=200) assert response.data[0]["save_request_status"] == SAVE_REQUEST_REJECTED def test_remove_save_request(client): sor = SaveOriginRequest.objects.create( visit_type="git", origin_url="https://wikipedia.com", status=SAVE_REQUEST_PENDING, ) assert SaveOriginRequest.objects.count() == 1 remove_request_url = reverse( "admin-origin-save-request-remove", url_args={"sor_id": sor.id} ) check_not_login(client, remove_request_url) client.login(username=_user_name, password=_user_password) check_http_post_response(client, remove_request_url, status_code=200) 
assert SaveOriginRequest.objects.count() == 0 diff --git a/swh/web/tests/api/views/test_origin_save.py b/swh/web/tests/api/views/test_origin_save.py index 9705c6b5..8e627f35 100644 --- a/swh/web/tests/api/views/test_origin_save.py +++ b/swh/web/tests/api/views/test_origin_save.py @@ -1,625 +1,543 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta +import uuid import pytest from django.contrib.auth.models import User from django.core.exceptions import ObjectDoesNotExist from django.utils import timezone from swh.web.auth.utils import SWH_AMBASSADOR_PERMISSION from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_FAILED, VISIT_STATUS_FULL, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.common.typing import OriginExistenceCheckInfo from swh.web.common.utils import reverse from swh.web.settings.tests import save_origin_rate_post from swh.web.tests.utils import ( check_api_get_responses, check_api_post_response, check_api_post_responses, ) pytestmark = pytest.mark.django_db @pytest.fixture(autouse=True) def populated_db(): SaveAuthorizedOrigin.objects.create(url="https://github.com/"), SaveAuthorizedOrigin.objects.create(url="https://gitlab.com/"), SaveUnauthorizedOrigin.objects.create(url="https://github.com/user/illegal_repo") SaveUnauthorizedOrigin.objects.create(url="https://gitlab.com/user_to_exclude") -def test_invalid_visit_type(api_client): +def test_invalid_visit_type(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={ "visit_type": "foo", "origin_url": "https://github.com/torvalds/linux", }, ) check_api_get_responses(api_client, url, status_code=400) -def test_invalid_origin_url(api_client): +def test_invalid_origin_url(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": "bar"} ) check_api_get_responses(api_client, url, status_code=400) def check_created_save_request_status( api_client, mocker, origin_url, expected_request_status, - scheduler_task_status=None, - scheduler_task_run_status=None, expected_task_status=None, visit_date=None, ): - mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") mock_origin_exists = mocker.patch("swh.web.common.origin_save.origin_exists") mock_origin_exists.return_value = OriginExistenceCheckInfo( origin_url=origin_url, exists=True, last_modified=None, content_length=None ) - if scheduler_task_status is None: - mock_scheduler.get_tasks.return_value = [] - else: - mock_scheduler.get_tasks.return_value = [ - { - "priority": "high", - "policy": "oneshot", - "type": "load-git", - "arguments": {"kwargs": {"repo_url": origin_url}, "args": []}, - "status": scheduler_task_status, - "id": 1, - } - ] - - if scheduler_task_run_status is None: - mock_scheduler.get_task_runs.return_value = [] - else: - mock_scheduler.get_task_runs.return_value = [ - { - "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205", - "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5), - "id": 1, - "metadata": {}, - "scheduled": datetime.now(tz=timezone.utc), - "started": None, - "status": scheduler_task_run_status, - "task": 1, - } - ] 
- - mock_scheduler.create_tasks.return_value = [ - { - "priority": "high", - "policy": "oneshot", - "type": "load-git", - "arguments": {"kwargs": {"repo_url": origin_url}, "args": []}, - "status": "next_run_not_scheduled", - "id": 1, - } - ] - url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url} ) mock_visit_date = mocker.patch( ("swh.web.common.origin_save._get_visit_info_for_save_request") ) mock_visit_date.return_value = (visit_date, None) if expected_request_status != SAVE_REQUEST_REJECTED: response = check_api_post_responses(api_client, url, data=None, status_code=200) assert response.data["save_request_status"] == expected_request_status assert response.data["save_task_status"] == expected_task_status else: check_api_post_responses(api_client, url, data=None, status_code=403) def check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, expected_request_status, expected_task_status, scheduler_task_status="next_run_not_scheduled", scheduler_task_run_status=None, visit_date=None, visit_status=None, ): - mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") - mock_scheduler.get_tasks.return_value = [ - { - "priority": "high", - "policy": "oneshot", - "type": "load-git", - "arguments": {"kwargs": {"repo_url": origin_url}, "args": []}, - "status": scheduler_task_status, - "id": 1, - } - ] - if scheduler_task_run_status is None: - mock_scheduler.get_task_runs.return_value = [] - else: - mock_scheduler.get_task_runs.return_value = [ - { - "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205", - "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5), - "id": 1, - "metadata": {}, - "scheduled": datetime.now(tz=timezone.utc), - "started": None, - "status": scheduler_task_run_status, - "task": 1, - } - ] + if expected_task_status != SAVE_TASK_NOT_CREATED: + task = dict(swh_scheduler.search_tasks()[0].items()) + backend_id = str(uuid.uuid4()) + + if scheduler_task_status != "next_run_not_scheduled": + swh_scheduler.schedule_task_run(task["id"], backend_id) + + if scheduler_task_run_status is not None: + swh_scheduler.start_task_run(backend_id) + task_run = dict( + swh_scheduler.end_task_run(backend_id, scheduler_task_run_status).items() + ) url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url} ) mock_visit_date = mocker.patch( ("swh.web.common.origin_save._get_visit_info_for_save_request") ) mock_visit_date.return_value = (visit_date, visit_status) response = check_api_get_responses(api_client, url, status_code=200) save_request_data = response.data[0] assert save_request_data["save_request_status"] == expected_request_status assert save_request_data["save_task_status"] == expected_task_status assert save_request_data["visit_status"] == visit_status - # Check that save task status is still available when - # the scheduler task has been archived - mock_scheduler.get_tasks.return_value = [] - response = check_api_get_responses(api_client, url, status_code=200) - save_request_data = response.data[0] - assert save_request_data["save_task_status"] == expected_task_status - assert save_request_data["visit_status"] == visit_status + if scheduler_task_run_status is not None: + # Check that save task status is still available when + # the scheduler task has been archived + swh_scheduler.delete_archived_tasks( + [{"task_id": task["id"], "task_run_id": task_run["id"]}] + ) + response = check_api_get_responses(api_client, url, status_code=200) + save_request_data = response.data[0] + assert 
save_request_data["save_task_status"] == expected_task_status + assert save_request_data["visit_status"] == visit_status -def test_save_request_rejected(api_client, mocker): +def test_save_request_rejected(api_client, mocker, swh_scheduler): origin_url = "https://github.com/user/illegal_repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_REJECTED, ) check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_REJECTED, expected_task_status=SAVE_TASK_NOT_CREATED, ) -def test_save_request_pending(api_client, mocker): +def test_save_request_pending(api_client, mocker, swh_scheduler): origin_url = "https://unkwownforge.com/user/repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_PENDING, expected_task_status=SAVE_TASK_NOT_CREATED, ) check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_PENDING, expected_task_status=SAVE_TASK_NOT_CREATED, ) -def test_save_request_succeed(api_client, mocker): +def test_save_request_scheduled(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, scheduler_task_status="next_run_scheduled", scheduler_task_run_status="scheduled", ) + + +def test_save_request_completed(api_client, mocker, swh_scheduler): + origin_url = "https://github.com/Kitware/CMake" + check_created_save_request_status( + api_client, + mocker, + origin_url, + expected_request_status=SAVE_REQUEST_ACCEPTED, + expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, + ) check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SUCCEEDED, scheduler_task_status="completed", scheduler_task_run_status="eventful", visit_date=None, ) + + +def test_save_request_completed_visit_status(api_client, mocker, swh_scheduler): + origin_url = "https://github.com/Kitware/CMake" + check_created_save_request_status( + api_client, + mocker, + origin_url, + expected_request_status=SAVE_REQUEST_ACCEPTED, + expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, + ) visit_date = datetime.now(tz=timezone.utc) + timedelta(hours=1) check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SUCCEEDED, scheduler_task_status="completed", scheduler_task_run_status="eventful", visit_date=visit_date, visit_status=VISIT_STATUS_FULL, ) -def test_save_request_failed(api_client, mocker): +def test_save_request_failed(api_client, mocker, swh_scheduler): origin_url = "https://gitlab.com/inkscape/inkscape" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, - origin_url, - expected_request_status=SAVE_REQUEST_ACCEPTED, - expected_task_status=SAVE_TASK_SCHEDULED, - scheduler_task_status="next_run_scheduled", - scheduler_task_run_status="scheduled", - ) - check_save_request_status( - api_client, - mocker, + swh_scheduler, 
origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_FAILED, scheduler_task_status="disabled", scheduler_task_run_status="failed", visit_status=VISIT_STATUS_FAILED, ) -def test_create_save_request_only_when_needed(api_client, mocker): +def test_create_save_request_no_duplicate(api_client, mocker, swh_scheduler): origin_url = "https://github.com/webpack/webpack" - SaveOriginRequest.objects.create( - visit_type="git", - origin_url=origin_url, - status=SAVE_REQUEST_ACCEPTED, - loading_task_id=56, - ) check_created_save_request_status( api_client, mocker, origin_url, - scheduler_task_status="next_run_not_scheduled", expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 1 - check_created_save_request_status( + check_save_request_status( api_client, mocker, + swh_scheduler, origin_url, - scheduler_task_status="next_run_scheduled", - scheduler_task_run_status="scheduled", expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, + scheduler_task_status="next_run_scheduled", + scheduler_task_run_status="scheduled", ) - sors = list( - SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) - ) - assert len(sors) == 1 - - visit_date = datetime.now(tz=timezone.utc) + timedelta(hours=1) - check_created_save_request_status( - api_client, - mocker, - origin_url, - scheduler_task_status="completed", - expected_request_status=SAVE_REQUEST_ACCEPTED, - expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, - visit_date=visit_date, - ) - sors = list( - SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) - ) - # check_api_post_responses sends two POST requests to check YAML and JSON response - assert len(sors) == 3 check_created_save_request_status( api_client, mocker, origin_url, - scheduler_task_status="disabled", expected_request_status=SAVE_REQUEST_ACCEPTED, - expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, + expected_task_status=SAVE_TASK_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) - assert len(sors) == 5 + assert len(sors) == 1 -def test_get_save_requests_unknown_origin(api_client): +def test_get_save_requests_unknown_origin(api_client, swh_scheduler): unknown_origin_url = "https://gitlab.com/foo/bar" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": unknown_origin_url}, ) response = check_api_get_responses(api_client, url, status_code=404) assert response.data == { "exception": "NotFoundExc", "reason": ( "No save requests found for visit of type git on origin with url %s." 
) % unknown_origin_url, } _visit_type = "git" _origin_url = "https://github.com/python/cpython" def test_save_requests_rate_limit(api_client, mocker): create_save_origin_request = mocker.patch( "swh.web.api.views.origin_save.create_save_origin_request" ) def _save_request_dict(*args, **kwargs): return { "id": 1, "visit_type": _visit_type, "origin_url": _origin_url, "save_request_date": datetime.now().isoformat(), "save_request_status": SAVE_REQUEST_ACCEPTED, "save_task_status": SAVE_TASK_NOT_YET_SCHEDULED, "visit_date": None, "visit_status": None, } create_save_origin_request.side_effect = _save_request_dict url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) for _ in range(save_origin_rate_post): check_api_post_response(api_client, url, status_code=200) check_api_post_response(api_client, url, status_code=429) def test_save_request_form_server_error(api_client, mocker): create_save_origin_request = mocker.patch( "swh.web.api.views.origin_save.create_save_origin_request" ) create_save_origin_request.side_effect = Exception("Server error") url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) check_api_post_responses(api_client, url, status_code=500) @pytest.fixture def origin_to_review(): return "https://git.example.org/user/project" def test_create_save_request_pending_review_anonymous_user( - api_client, origin_to_review + api_client, origin_to_review, swh_scheduler ): url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_to_review}, ) response = check_api_post_responses(api_client, url, status_code=200) assert response.data["save_request_status"] == SAVE_REQUEST_PENDING with pytest.raises(ObjectDoesNotExist): SaveAuthorizedOrigin.objects.get(url=origin_to_review) def test_create_save_request_archives_with_ambassador_user( - api_client, origin_to_review, keycloak_oidc, mocker, requests_mock, + api_client, keycloak_oidc, requests_mock, swh_scheduler, ): + swh_scheduler.add_load_archive_task_type() keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") originUrl = "https://somewhere.org/simple" artifact_version = "1.2.3" artifact_filename = f"tarball-{artifact_version}.tar.gz" artifact_url = f"{originUrl}/{artifact_filename}" content_length = "100" last_modified = "Sun, 21 Aug 2011 16:26:32 GMT" requests_mock.head( artifact_url, status_code=200, headers={"content-length": content_length, "last-modified": last_modified,}, ) - mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") - mock_scheduler.get_task_runs.return_value = [] - mock_scheduler.create_tasks.return_value = [ - { - "id": 10, - "priority": "high", - "policy": "oneshot", - "status": "next_run_not_scheduled", - "type": "load-archive-files", - "arguments": { - "args": [], - "kwargs": { - "url": originUrl, - "artifacts": [ - { - "url": artifact_url, - "version": artifact_version, - "time": last_modified, - "length": content_length, - } - ], - }, - }, - }, - ] - url = reverse( "api-1-save-origin", url_args={"visit_type": "archives", "origin_url": originUrl,}, ) response = check_api_post_response( api_client, url, status_code=200, data={ "archives_data": [ {"artifact_url": artifact_url, "artifact_version": artifact_version,} ] }, ) assert response.data["save_request_status"] == SAVE_REQUEST_ACCEPTED assert SaveAuthorizedOrigin.objects.get(url=originUrl) def 
test_create_save_request_archives_missing_artifacts_data( - api_client, origin_to_review, keycloak_oidc, mocker, requests_mock, + api_client, keycloak_oidc, swh_scheduler ): + swh_scheduler.add_load_archive_task_type() + keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") originUrl = "https://somewhere.org/simple" url = reverse( "api-1-save-origin", url_args={"visit_type": "archives", "origin_url": originUrl,}, ) response = check_api_post_response(api_client, url, status_code=400, data={},) assert "Artifacts data are missing" in response.data["reason"] response = check_api_post_response( api_client, url, status_code=400, data={"archives_data": [{"artifact_url": "", "arttifact_version": "1.0"}]}, ) assert "Missing url or version for an artifact to load" in response.data["reason"] def test_create_save_request_archives_accepted_ambassador_user( - api_client, origin_to_review, keycloak_oidc, mocker + api_client, origin_to_review, keycloak_oidc, mocker, swh_scheduler ): keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") check_created_save_request_status( api_client, mocker, origin_to_review, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) assert SaveAuthorizedOrigin.objects.get(url=origin_to_review) -def test_create_save_request_anonymous_user_no_user_id(api_client): +def test_create_save_request_anonymous_user_no_user_id(api_client, swh_scheduler): origin_url = "https://some.git.hosters/user/repo" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, ) check_api_post_responses(api_client, url, status_code=200) sor = SaveOriginRequest.objects.get(origin_url=origin_url) assert sor.user_ids is None def test_create_save_request_authenticated_user_id( - api_client, origin_to_review, keycloak_oidc, mocker + api_client, keycloak_oidc, swh_scheduler ): oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") origin_url = "https://some.git.hosters/user/repo2" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, ) response = check_api_post_response(api_client, url, status_code=200) assert response.wsgi_request.user.id is not None user_id = str(response.wsgi_request.user.id) sor = SaveOriginRequest.objects.get(user_ids=f'"{user_id}"') assert sor.user_ids == f'"{user_id}"' -def test_create_pending_save_request_multiple_authenticated_users(api_client): +def test_create_pending_save_request_multiple_authenticated_users( + api_client, swh_scheduler +): origin_url = "https://some.git.hosters/user/repo3" first_user = User.objects.create_user(username="first_user", password="") second_user = User.objects.create_user(username="second_user", password="") url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, ) api_client.force_login(first_user) check_api_post_response(api_client, url, status_code=200) api_client.force_login(second_user) check_api_post_response(api_client, url, status_code=200) assert SaveOriginRequest.objects.get(user_ids__contains=f'"{first_user.id}"') assert SaveOriginRequest.objects.get(user_ids__contains=f'"{second_user.id}"') diff --git a/swh/web/tests/browse/views/test_origin.py 
b/swh/web/tests/browse/views/test_origin.py index 054e9423..19d52fea 100644 --- a/swh/web/tests/browse/views/test_origin.py +++ b/swh/web/tests/browse/views/test_origin.py @@ -1,1299 +1,1299 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random import re import string from hypothesis import given from django.utils.html import escape from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT from swh.model.model import ( OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.storage.utils import now from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.common.exc import NotFoundExc from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import ( format_utc_iso_date, gen_path_info, parse_iso8601_date_to_utc, reverse, ) from swh.web.tests.data import get_content, random_sha1 from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.strategies import ( new_origin, new_snapshot, origin, origin_with_multiple_visits, origin_with_pull_request_branches, origin_with_releases, ) from swh.web.tests.strategies import release as existing_release from swh.web.tests.strategies import revisions, unknown_revision, visit_dates from swh.web.tests.utils import check_html_get_response @given(origin_with_multiple_visits()) def test_origin_visits_browse(client, archive_data, origin): url = reverse("browse-origin-visits", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/origin-visits.html" ) visits = archive_data.origin_visit_get(origin["url"]) for v in visits: vdate = format_utc_iso_date(v["date"], "%Y-%m-%dT%H:%M:%SZ") browse_dir_url = reverse( "browse-origin-directory", query_params={"origin_url": origin["url"], "timestamp": vdate}, ) assert_contains(resp, browse_dir_url) _check_origin_link(resp, origin["url"]) @given(origin_with_multiple_visits()) -def test_origin_content_view(client, archive_data, origin): +def test_origin_content_view(client, archive_data, swh_scheduler, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) def _get_archive_data(visit_idx): snapshot = archive_data.snapshot_get(origin_visits[visit_idx]["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) dir_content = archive_data.directory_ls(head_rev["directory"]) dir_files = [e for e in dir_content if e["type"] == "file"] dir_file = random.choice(dir_files) branches, releases, _ = process_snapshot_branches(snapshot) return { "branches": branches, "releases": releases, "root_dir_sha1": head_rev["directory"], "content": get_content(dir_file["checksums"]["sha1"]), "visit": origin_visits[visit_idx], "snapshot_sizes": archive_data.snapshot_count_branches(snapshot["id"]), } tdata = _get_archive_data(-1) _origin_content_view_test_helper( client, archive_data, origin, origin_visits[-1], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], ) _origin_content_view_test_helper( client, archive_data, origin, origin_visits[-1], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], 
tdata["root_dir_sha1"], tdata["content"], timestamp=tdata["visit"]["date"], ) _origin_content_view_test_helper( client, archive_data, origin, origin_visits[-1], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], snapshot_id=tdata["visit"]["snapshot"], ) tdata = _get_archive_data(0) _origin_content_view_test_helper( client, archive_data, origin, origin_visits[0], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], visit_id=tdata["visit"]["visit"], ) _origin_content_view_test_helper( client, archive_data, origin, origin_visits[0], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], snapshot_id=tdata["visit"]["snapshot"], ) @given(origin()) -def test_origin_root_directory_view(client, archive_data, origin): +def test_origin_root_directory_view(client, archive_data, swh_scheduler, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) root_dir_sha1 = head_rev["directory"] dir_content = archive_data.directory_ls(root_dir_sha1) branches, releases, _ = process_snapshot_branches(snapshot) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, visit_id=visit["visit"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, timestamp=visit["date"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, snapshot_id=visit["snapshot"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, visit_id=visit["visit"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, timestamp=visit["date"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, snapshot_id=visit["snapshot"], ) @given(origin()) -def test_origin_sub_directory_view(client, archive_data, origin): +def test_origin_sub_directory_view(client, archive_data, swh_scheduler, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) root_dir_sha1 = head_rev["directory"] subdirs = [ e for e in archive_data.directory_ls(root_dir_sha1) if e["type"] == "dir" ] branches, releases, _ = process_snapshot_branches(snapshot) if len(subdirs) == 0: return subdir = random.choice(subdirs) subdir_content = archive_data.directory_ls(subdir["target"]) subdir_path = subdir["name"] 
_origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, visit_id=visit["visit"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, timestamp=visit["date"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, snapshot_id=visit["snapshot"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, visit_id=visit["visit"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, timestamp=visit["date"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, snapshot_id=visit["snapshot"], ) @given(origin()) def test_origin_branches(client, archive_data, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) snapshot_content = process_snapshot_branches(snapshot) _origin_branches_test_helper(client, origin, snapshot_content, snapshot_sizes) _origin_branches_test_helper( client, origin, snapshot_content, snapshot_sizes, snapshot_id=visit["snapshot"] ) @given(origin()) def test_origin_releases(client, archive_data, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) snapshot_content = process_snapshot_branches(snapshot) _origin_releases_test_helper(client, origin, snapshot_content, snapshot_sizes) _origin_releases_test_helper( client, origin, snapshot_content, snapshot_sizes, snapshot_id=visit["snapshot"] ) @given( new_origin(), new_snapshot(min_size=4, max_size=4), visit_dates(), revisions(min_size=3, max_size=3), ) def test_origin_snapshot_null_branch( client, archive_data, new_origin, new_snapshot, visit_dates, revisions ): snp_dict = new_snapshot.to_dict() archive_data.origin_add([new_origin]) for i, branch in enumerate(snp_dict["branches"].keys()): if i == 0: snp_dict["branches"][branch] = None else: snp_dict["branches"][branch] = { "target_type": "revision", "target": hash_to_bytes(revisions[i - 1]), } archive_data.snapshot_add([Snapshot.from_dict(snp_dict)]) visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="partial", snapshot=snp_dict["id"], ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url} ) check_html_get_response( client, url, status_code=200, 
template_used="browse/directory.html" ) @given( new_origin(), new_snapshot(min_size=4, max_size=4), visit_dates(), revisions(min_size=4, max_size=4), ) def test_origin_snapshot_invalid_branch( client, archive_data, new_origin, new_snapshot, visit_dates, revisions ): snp_dict = new_snapshot.to_dict() archive_data.origin_add([new_origin]) for i, branch in enumerate(snp_dict["branches"].keys()): snp_dict["branches"][branch] = { "target_type": "revision", "target": hash_to_bytes(revisions[i]), } archive_data.snapshot_add([Snapshot.from_dict(snp_dict)]) visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="full", snapshot=snp_dict["id"], ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url, "branch": "invalid_branch"}, ) check_html_get_response(client, url, status_code=404, template_used="error.html") @given(new_origin()) def test_browse_visits_origin_not_found(client, new_origin): url = reverse("browse-origin-visits", query_params={"origin_url": new_origin.url}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains( resp, f"Origin with url {new_origin.url} not found", status_code=404 ) @given(origin()) def test_browse_origin_directory_no_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [] mock_archive = mocker.patch("swh.web.common.origin_visits.archive") mock_archive.lookup_origin_visit_latest.return_value = None url = reverse("browse-origin-directory", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains(resp, "No valid visit", status_code=404) assert not mock_get_origin_visits.called @given(origin()) def test_browse_origin_directory_unknown_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [{"visit": 1}] url = reverse( "browse-origin-directory", query_params={"origin_url": origin["url"], "visit_id": 2}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search("Visit.*not found", resp.content.decode("utf-8")) assert mock_get_origin_visits.called @given(origin()) def test_browse_origin_directory_not_found(client, origin): url = reverse( "browse-origin-directory", query_params={"origin_url": origin["url"], "path": "/invalid/dir/path/"}, ) resp = check_html_get_response( client, url, status_code=404, template_used="browse/directory.html" ) assert re.search("Directory.*not found", resp.content.decode("utf-8")) @given(origin()) def test_browse_origin_content_no_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [] mock_archive = mocker.patch("swh.web.common.origin_visits.archive") mock_archive.lookup_origin_visit_latest.return_value = None url = reverse( "browse-origin-content", query_params={"origin_url": origin["url"], "path": "foo"}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains(resp, "No valid visit", status_code=404) assert not mock_get_origin_visits.called 
@given(origin()) def test_browse_origin_content_unknown_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [{"visit": 1}] url = reverse( "browse-origin-content", query_params={"origin_url": origin["url"], "path": "foo", "visit_id": 2}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search("Visit.*not found", resp.content.decode("utf-8")) assert mock_get_origin_visits.called @given(origin()) def test_browse_origin_content_directory_empty_snapshot(client, mocker, origin): mock_snapshot_archive = mocker.patch("swh.web.browse.snapshot_context.archive") mock_get_origin_visit_snapshot = mocker.patch( "swh.web.browse.snapshot_context.get_origin_visit_snapshot" ) mock_get_origin_visit_snapshot.return_value = ([], [], {}) mock_snapshot_archive.lookup_origin.return_value = origin mock_snapshot_archive.lookup_snapshot_sizes.return_value = { "alias": 0, "revision": 0, "release": 0, } for browse_context in ("content", "directory"): url = reverse( f"browse-origin-{browse_context}", query_params={"origin_url": origin["url"], "path": "baz"}, ) resp = check_html_get_response( client, url, status_code=200, template_used=f"browse/{browse_context}.html" ) assert re.search("snapshot.*is empty", resp.content.decode("utf-8")) assert mock_get_origin_visit_snapshot.called assert mock_snapshot_archive.lookup_origin.called @given(origin()) def test_browse_origin_content_not_found(client, origin): url = reverse( "browse-origin-content", query_params={"origin_url": origin["url"], "path": "/invalid/file/path"}, ) resp = check_html_get_response( client, url, status_code=404, template_used="browse/content.html" ) assert re.search("Directory entry.*not found", resp.content.decode("utf-8")) @given(origin()) def test_browse_directory_snapshot_not_found(client, mocker, origin): mock_get_snapshot_context = mocker.patch( "swh.web.browse.snapshot_context.get_snapshot_context" ) mock_get_snapshot_context.side_effect = NotFoundExc("Snapshot not found") url = reverse("browse-origin-directory", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains(resp, "Snapshot not found", status_code=404) assert mock_get_snapshot_context.called @given(origin()) def test_origin_empty_snapshot(client, mocker, origin): mock_archive = mocker.patch("swh.web.browse.snapshot_context.archive") mock_get_origin_visit_snapshot = mocker.patch( "swh.web.browse.snapshot_context.get_origin_visit_snapshot" ) mock_get_origin_visit_snapshot.return_value = ([], [], {}) mock_archive.lookup_snapshot_sizes.return_value = { "alias": 0, "revision": 0, "release": 0, } mock_archive.lookup_origin.return_value = origin url = reverse("browse-origin-directory", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) resp_content = resp.content.decode("utf-8") assert re.search("snapshot.*is empty", resp_content) assert not re.search("swh-tr-link", resp_content) assert mock_get_origin_visit_snapshot.called @given(new_origin()) def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin): snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( target="refs/head/master".encode(), target_type=TargetType.ALIAS, ), b"refs/head/master": None, } ) archive_data.origin_add([new_origin]) 
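    # Persist the snapshot whose only concrete branch is null (HEAD is an
    # alias to a dangling target) and attach it to a partial visit: the
    # browse view must still render an "empty snapshot" page rather than fail.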
archive_data.snapshot_add([snapshot]) visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=now(), type="git",)] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="partial", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) resp_content = resp.content.decode("utf-8") assert re.search("snapshot.*is empty", resp_content) assert not re.search("swh-tr-link", resp_content) @given(origin_with_releases()) def test_origin_release_browse(client, archive_data, origin): snapshot = archive_data.snapshot_get_latest(origin["url"]) release = [ b for b in snapshot["branches"].values() if b["target_type"] == "release" ][-1] release_data = archive_data.release_get(release["target"]) revision_data = archive_data.revision_get(release_data["target"]) url = reverse( "browse-origin-directory", query_params={"origin_url": origin["url"], "release": release_data["name"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, release_data["name"]) assert_contains(resp, release["target"]) swhid_context = { "origin": origin["url"], "visit": gen_swhid(SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(RELEASE, release_data["id"]), } swh_dir_id = gen_swhid( DIRECTORY, revision_data["directory"], metadata=swhid_context ) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) @given(origin_with_releases()) def test_origin_release_browse_not_found(client, origin): invalid_release_name = "swh-foo-bar" url = reverse( "browse-origin-directory", query_params={"origin_url": origin["url"], "release": invalid_release_name}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search( f"Release {invalid_release_name}.*not found", resp.content.decode("utf-8") ) @given(new_origin(), unknown_revision()) def test_origin_browse_directory_branch_with_non_resolvable_revision( client, archive_data, new_origin, unknown_revision ): branch_name = "master" snapshot = Snapshot( branches={ branch_name.encode(): SnapshotBranch( target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION, ) } ) archive_data.origin_add([new_origin]) archive_data.snapshot_add([snapshot]) visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=now(), type="git",)] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="partial", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url, "branch": branch_name}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains( resp, f"Revision {unknown_revision } could not be found in the archive." 
) @given(origin()) def test_origin_content_no_path(client, origin): url = reverse("browse-origin-content", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=400, template_used="error.html" ) assert_contains( resp, "The path of a content must be given as query parameter.", status_code=400 ) def test_origin_views_no_url_query_parameter(client): for browse_context in ( "content", "directory", "log", "branches", "releases", "visits", ): url = reverse(f"browse-origin-{browse_context}") resp = check_html_get_response( client, url, status_code=400, template_used="error.html" ) assert_contains( resp, "An origin URL must be provided as query parameter.", status_code=400 ) def _origin_content_view_test_helper( client, archive_data, origin_info, origin_visit, snapshot_sizes, origin_branches, origin_releases, root_dir_sha1, content, visit_id=None, timestamp=None, snapshot_id=None, ): content_path = "/".join(content["path"].split("/")[1:]) if not visit_id and not snapshot_id: visit_id = origin_visit["visit"] query_params = {"origin_url": origin_info["url"], "path": content_path} if timestamp: query_params["timestamp"] = timestamp if visit_id: query_params["visit_id"] = visit_id elif snapshot_id: query_params["snapshot"] = snapshot_id url = reverse("browse-origin-content", query_params=query_params) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) assert type(content["data"]) == str assert_contains(resp, '' % content["hljs_language"]) assert_contains(resp, escape(content["data"])) split_path = content_path.split("/") filename = split_path[-1] path = content_path.replace(filename, "")[:-1] path_info = gen_path_info(path) del query_params["path"] if timestamp: query_params["timestamp"] = format_utc_iso_date( parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ" ) root_dir_url = reverse("browse-origin-directory", query_params=query_params) assert_contains(resp, '
<li class="swh-path">', count=len(path_info) + 1) assert_contains(resp, '<a href="%s">%s</a>' % (root_dir_url, root_dir_sha1[:7])) for p in path_info: query_params["path"] = p["path"] dir_url = reverse("browse-origin-directory", query_params=query_params) assert_contains(resp, '<a href="%s">%s</a>' % (dir_url, p["name"])) assert_contains(resp, "<li>%s</li>" % filename) query_string = "sha1_git:" + content["sha1_git"] url_raw = reverse( "browse-content-raw", url_args={"query_string": query_string}, query_params={"filename": filename}, ) assert_contains(resp, url_raw) if "path" in query_params: del query_params["path"] origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) assert_contains(resp, f'href="{escape(origin_releases_url)}">') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '
<li class="swh-branch">', count=len(origin_branches)) query_params["path"] = content_path for branch in origin_branches: root_dir_branch_url = reverse( "browse-origin-content", query_params={"branch": branch["name"], **query_params}, ) assert_contains(resp, '<a href="%s">' % root_dir_branch_url) assert_contains(resp, '
<li class="swh-release">', count=len(origin_releases)) query_params["branch"] = None for release in origin_releases: root_dir_release_url = reverse( "browse-origin-content", query_params={"release": release["name"], **query_params}, ) assert_contains(resp, '<a href="%s">' % root_dir_release_url) url = reverse("browse-origin-content", query_params=query_params) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) snapshot = archive_data.snapshot_get(origin_visit["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) swhid_context = { "origin": origin_info["url"], "visit": gen_swhid(SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(REVISION, head_rev_id), "path": f"/{content_path}", } swh_cnt_id = gen_swhid(CONTENT, content["sha1_git"], metadata=swhid_context) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) assert_contains(resp, "swh-take-new-snapshot") _check_origin_link(resp, origin_info["url"]) assert_not_contains(resp, "swh-metadata-popover") def _origin_directory_view_test_helper( client, archive_data, origin_info, origin_visit, snapshot_sizes, origin_branches, origin_releases, root_directory_sha1, directory_entries, visit_id=None, timestamp=None, snapshot_id=None, path=None, ): dirs = [e for e in directory_entries if e["type"] in ("dir", "rev")] files = [e for e in directory_entries if e["type"] == "file"] if not visit_id and not snapshot_id: visit_id = origin_visit["visit"] query_params = {"origin_url": origin_info["url"]} if timestamp: query_params["timestamp"] = timestamp elif visit_id: query_params["visit_id"] = visit_id else: query_params["snapshot"] = snapshot_id if path: query_params["path"] = path url = reverse("browse-origin-directory", query_params=query_params) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, '<td class="swh-directory">', count=len(dirs)) assert_contains(resp, '<td class="swh-content">', count=len(files)) if timestamp:
<li class="swh-path">', count=nb_bc_paths) assert_contains( resp, '<a href="%s">%s</a>' % (root_dir_branch_url, root_directory_sha1[:7]) ) origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) nb_releases = len(origin_releases) if nb_releases > 0: assert_contains(resp, f'href="{escape(origin_releases_url)}"') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") if path: query_params["path"] = path assert_contains(resp, '
<li class="swh-branch">', count=len(origin_branches)) for branch in origin_branches: query_params["branch"] = branch["name"] root_dir_branch_url = reverse( "browse-origin-directory", query_params=query_params ) assert_contains(resp, '<a href="%s">' % root_dir_branch_url) assert_contains(resp, '
<li class="swh-release">', count=len(origin_releases)) query_params["branch"] = None for release in origin_releases: query_params["release"] = release["name"] root_dir_release_url = reverse( "browse-origin-directory", query_params=query_params ) assert_contains(resp, 'href="%s"' % root_dir_release_url) assert_contains(resp, "vault-cook-directory") assert_contains(resp, "vault-cook-revision") snapshot = archive_data.snapshot_get(origin_visit["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) swhid_context = { "origin": origin_info["url"], "visit": gen_swhid(SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(REVISION, head_rev_id), "path": f"/{path}" if path else None, } swh_dir_id = gen_swhid( DIRECTORY, directory_entries[0]["dir_id"], metadata=swhid_context ) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) assert_contains(resp, "swh-take-new-snapshot") _check_origin_link(resp, origin_info["url"]) assert_not_contains(resp, "swh-metadata-popover") def _origin_branches_test_helper( client, origin_info, origin_snapshot, snapshot_sizes, snapshot_id=None ): query_params = {"origin_url": origin_info["url"], "snapshot": snapshot_id} url = reverse("browse-origin-branches", query_params=query_params) resp = check_html_get_response( client, url, status_code=200, template_used="browse/branches.html" ) origin_branches = origin_snapshot[0] origin_releases = origin_snapshot[1] origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) nb_releases = len(origin_releases) if nb_releases > 0: assert_contains(resp, f'href="{escape(origin_releases_url)}">') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<tr class="swh-branch-entry', count=len(origin_branches)) for branch in origin_branches: browse_branch_url = reverse( "browse-origin-directory", query_params={"branch": branch["name"], **query_params}, ) assert_contains(resp, '<a href="%s">' % escape(browse_branch_url)) browse_revision_url = reverse( "browse-revision", url_args={"sha1_git": branch["revision"]}, query_params=query_params, ) assert_contains(resp, '<a href="%s">' % escape(browse_revision_url)) _check_origin_link(resp, origin_info["url"]) def _origin_releases_test_helper( client, origin_info, origin_snapshot, snapshot_sizes, snapshot_id=None ): query_params = {"origin_url": origin_info["url"], "snapshot": snapshot_id} url = reverse("browse-origin-releases", query_params=query_params) resp = check_html_get_response( client, url, status_code=200, template_used="browse/releases.html" ) origin_releases = origin_snapshot[1] origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) nb_releases = len(origin_releases) if nb_releases > 0: assert_contains(resp, f'href="{escape(origin_releases_url)}"') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<tr class="swh-release-entry', count=nb_releases) for release in origin_releases: browse_release_url = reverse( "browse-release", url_args={"sha1_git": release["id"]}, query_params=query_params, ) browse_revision_url = reverse( "browse-revision", url_args={"sha1_git": release["target"]}, query_params=query_params, ) assert_contains(resp, '<a href="%s">' % escape(browse_release_url)) assert_contains(resp, '<a href="%s">' % escape(browse_revision_url)) _check_origin_link(resp, origin_info["url"]) @given( new_origin(), visit_dates(), revisions(min_size=10, max_size=10), existing_release() ) def test_origin_branches_pagination_with_alias( client, archive_data, mocker, new_origin, visit_dates, revisions, existing_release ): """ When a snapshot contains a branch or a release alias, pagination
links in the branches / releases view should be displayed. """ mocker.patch("swh.web.browse.snapshot_context.PER_PAGE", len(revisions) / 2) snp_dict = {"branches": {}, "id": hash_to_bytes(random_sha1())} for i in range(len(revisions)): branch = "".join(random.choices(string.ascii_lowercase, k=8)) snp_dict["branches"][branch.encode()] = { "target_type": "revision", "target": hash_to_bytes(revisions[i]), } release = "".join(random.choices(string.ascii_lowercase, k=8)) snp_dict["branches"][b"RELEASE_ALIAS"] = { "target_type": "alias", "target": release.encode(), } snp_dict["branches"][release.encode()] = { "target_type": "release", "target": hash_to_bytes(existing_release), } archive_data.origin_add([new_origin]) archive_data.snapshot_add([Snapshot.from_dict(snp_dict)]) visit = archive_data.origin_visit_add( [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="full", snapshot=snp_dict["id"], ) archive_data.origin_visit_status_add([visit_status]) url = reverse("browse-origin-branches", query_params={"origin_url": new_origin.url}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/branches.html" ) assert_contains(resp, '
      Newer') if len(revision_log_sorted) > per_page: assert_contains( resp, 'Older' % escape(next_page_url), ) for log in revision_log_sorted[:per_page]: revision_url = reverse("browse-revision", url_args={"sha1_git": log["id"]}) assert_contains(resp, log["id"][:7]) assert_contains(resp, log["author"]["name"]) assert_contains(resp, format_utc_iso_date(log["date"])) assert_contains(resp, escape(log["message"])) assert_contains(resp, format_utc_iso_date(log["committer_date"])) assert_contains(resp, revision_url) if len(revision_log_sorted) <= per_page: return resp = check_html_get_response( client, next_page_url, status_code=200, template_used="browse/revision-log.html" ) prev_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": 0, "per_page": per_page}, ) next_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": 2 * per_page, "per_page": per_page}, ) nb_log_entries = len(revision_log_sorted) - per_page if nb_log_entries > per_page: nb_log_entries = per_page assert_contains(resp, 'Newer' % escape(prev_page_url) ) if len(revision_log_sorted) > 2 * per_page: assert_contains( resp, 'Older' % escape(next_page_url), ) if len(revision_log_sorted) <= 2 * per_page: return resp = check_html_get_response( client, next_page_url, status_code=200, template_used="browse/revision-log.html" ) prev_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": per_page, "per_page": per_page}, ) next_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": 3 * per_page, "per_page": per_page}, ) nb_log_entries = len(revision_log_sorted) - 2 * per_page if nb_log_entries > per_page: nb_log_entries = per_page assert_contains(resp, 'Newer' % escape(prev_page_url) ) if len(revision_log_sorted) > 3 * per_page: assert_contains( resp, 'Older' % escape(next_page_url), ) @given(revision(), unknown_revision(), new_origin()) def test_revision_request_errors(client, revision, unknown_revision, new_origin): url = reverse("browse-revision", url_args={"sha1_git": unknown_revision}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains( resp, "Revision with sha1_git %s not found" % unknown_revision, status_code=404 ) url = reverse( "browse-revision", url_args={"sha1_git": revision}, query_params={"origin_url": new_origin.url}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains( resp, "the origin mentioned in your request" " appears broken", status_code=404 ) @given(revision()) def test_revision_uppercase(client, revision): url = reverse( "browse-revision-uppercase-checksum", url_args={"sha1_git": revision.upper()} ) resp = check_html_get_response(client, url, status_code=302) redirect_url = reverse("browse-revision", url_args={"sha1_git": revision}) assert resp["location"] == redirect_url def _revision_browse_checks( client, archive_data, revision, origin_url=None, snapshot=None ): query_params = {} if origin_url: query_params["origin_url"] = origin_url if snapshot: query_params["snapshot"] = snapshot["id"] url = reverse( "browse-revision", url_args={"sha1_git": revision}, query_params=query_params ) revision_data = archive_data.revision_get(revision) author_name = revision_data["author"]["name"] committer_name = revision_data["committer"]["name"] dir_id = revision_data["directory"] if origin_url: snapshot = 
archive_data.snapshot_get_latest(origin_url) history_url = reverse( "browse-origin-log", query_params={"revision": revision, **query_params}, ) elif snapshot: history_url = reverse( "browse-snapshot-log", url_args={"snapshot_id": snapshot["id"]}, query_params={"revision": revision}, ) else: history_url = reverse("browse-revision-log", url_args={"sha1_git": revision}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/revision.html" ) assert_contains(resp, author_name) assert_contains(resp, committer_name) assert_contains(resp, history_url) for parent in revision_data["parents"]: parent_url = reverse( "browse-revision", url_args={"sha1_git": parent}, query_params=query_params ) assert_contains(resp, '%s' % (escape(parent_url), parent[:7])) author_date = revision_data["date"] committer_date = revision_data["committer_date"] message_lines = revision_data["message"].split("\n") assert_contains(resp, format_utc_iso_date(author_date)) assert_contains(resp, format_utc_iso_date(committer_date)) assert_contains(resp, escape(message_lines[0])) assert_contains(resp, escape("\n".join(message_lines[1:]))) assert_contains(resp, "vault-cook-directory") assert_contains(resp, "vault-cook-revision") swh_rev_id = gen_swhid("revision", revision) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) assert_contains(resp, swh_rev_id) assert_contains(resp, swh_rev_id_url) swh_dir_id = gen_swhid("directory", dir_id) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) if origin_url: assert_contains(resp, "swh-take-new-snapshot") swh_rev_id = gen_swhid(REVISION, revision) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) if origin_url: browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin_url} ) assert_contains(resp, f'href="{browse_origin_url}"') elif snapshot: swh_snp_id = gen_swhid("snapshot", snapshot["id"]) swh_snp_id_url = reverse("browse-swhid", url_args={"swhid": swh_snp_id}) assert_contains(resp, f'href="{swh_snp_id_url}"') swhid_context = {} if origin_url: swhid_context["origin"] = origin_url if snapshot: swhid_context["visit"] = gen_swhid(SNAPSHOT, snapshot["id"]) swh_rev_id = gen_swhid(REVISION, revision, metadata=swhid_context) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) assert_contains(resp, swh_rev_id) assert_contains(resp, swh_rev_id_url) swhid_context["anchor"] = gen_swhid(REVISION, revision) swh_dir_id = gen_swhid(DIRECTORY, dir_id, metadata=swhid_context) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) @given(revision()) def test_revision_invalid_path(client, archive_data, revision): path = "foo/bar" url = reverse( "browse-revision", url_args={"sha1_git": revision}, query_params={"path": path} ) resp = check_html_get_response( client, url, status_code=404, template_used="browse/revision.html" ) directory = archive_data.revision_get(revision)["directory"] error_message = ( f"Directory entry with path {path} from root directory {directory} not found" ) assert_contains(resp, error_message, status_code=404) assert_not_contains(resp, "swh-metadata-popover", status_code=404) @given(directory(), new_person(), new_swh_date()) def test_revision_metadata_display(archive_data, client, directory, person, date): metadata = {"foo": "bar"} revision = Revision( directory=hash_to_bytes(directory), 
author=person, committer=person, message=b"commit message", date=TimestampWithTimezone.from_datetime(date), committer_date=TimestampWithTimezone.from_datetime(date), synthetic=False, type=RevisionType.GIT, metadata=metadata, ) archive_data.revision_add([revision]) url = reverse("browse-revision", url_args={"sha1_git": hash_to_hex(revision.id)}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/revision.html" ) assert_contains(resp, "swh-metadata-popover") assert_contains(resp, escape(json.dumps(metadata, indent=4))) diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py index 1bf03f80..d9faf684 100644 --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -1,759 +1,762 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from functools import partial import re from typing import Optional +import uuid import iso8601 import pytest import requests from swh.core.pytest_plugin import get_response_cb +from swh.scheduler.utils import create_oneshot_task_dict from swh.web.common.exc import BadInputExc from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_CREATED, VISIT_STATUS_FULL, VISIT_STATUS_ONGOING, VISIT_STATUS_PARTIAL, SaveOriginRequest, ) from swh.web.common.origin_save import ( _check_origin_exists, _check_visit_type_savable, _visit_type_task, _visit_type_task_privileged, get_savable_visit_types, get_save_origin_requests, get_save_origin_task_info, origin_exists, refresh_save_origin_request_statuses, ) from swh.web.common.typing import ( OriginExistenceCheckInfo, OriginVisitInfo, SaveOriginRequestInfo, ) from swh.web.config import get_config _es_url = "http://esnode1.internal.softwareheritage.org:9200" _es_workers_index_url = "%s/swh_workers-*" % _es_url _origin_url = "https://gitlab.com/inkscape/inkscape" _visit_type = "git" -_task_id = 203525448 +_task_id = 1 @pytest.fixture(autouse=True) def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with post method""" cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.post(re.compile("https?://"), body=cb) return requests_mock_datadir @pytest.mark.django_db -def test_get_save_origin_archived_task_info(mocker): - _get_save_origin_task_info_test(mocker, task_archived=True) +def test_get_save_origin_archived_task_info(swh_scheduler): + _get_save_origin_task_info_test(swh_scheduler, task_archived=True) @pytest.mark.django_db -def test_get_save_origin_task_full_info_with_es(mocker): - _get_save_origin_task_info_test(mocker, es_available=True) +def test_get_save_origin_task_info_without_es(swh_scheduler): + _get_save_origin_task_info_test(swh_scheduler, es_available=False) -@pytest.mark.django_db -def test_get_save_origin_task_info_with_es(mocker): - _get_save_origin_task_info_test(mocker, es_available=True, full_info=False) - - -@pytest.mark.django_db -def test_get_save_origin_task_info_without_es(mocker): - _get_save_origin_task_info_test(mocker, es_available=False) - - -def _mock_scheduler( - mocker, +def _fill_scheduler_db( + swh_scheduler, task_status="completed", task_run_status="eventful", task_archived=False, 


@pytest.fixture(autouse=True)
def requests_mock_datadir(datadir, requests_mock_datadir):
    """Override default behavior to deal with post method"""
    cb = partial(get_response_cb, datadir=datadir)
    requests_mock_datadir.post(re.compile("https?://"), body=cb)
    return requests_mock_datadir


@pytest.mark.django_db
-def test_get_save_origin_archived_task_info(mocker):
-    _get_save_origin_task_info_test(mocker, task_archived=True)
+def test_get_save_origin_archived_task_info(swh_scheduler):
+    _get_save_origin_task_info_test(swh_scheduler, task_archived=True)


@pytest.mark.django_db
-def test_get_save_origin_task_full_info_with_es(mocker):
-    _get_save_origin_task_info_test(mocker, es_available=True)
+def test_get_save_origin_task_info_without_es(swh_scheduler):
+    _get_save_origin_task_info_test(swh_scheduler, es_available=False)


-@pytest.mark.django_db
-def test_get_save_origin_task_info_with_es(mocker):
-    _get_save_origin_task_info_test(mocker, es_available=True, full_info=False)
-
-
-@pytest.mark.django_db
-def test_get_save_origin_task_info_without_es(mocker):
-    _get_save_origin_task_info_test(mocker, es_available=False)
-
-
-def _mock_scheduler(
-    mocker,
+def _fill_scheduler_db(
+    swh_scheduler,
    task_status="completed",
    task_run_status="eventful",
    task_archived=False,
    visit_started_date=None,
):
-    mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler")
-    task = {
-        "arguments": {"args": [], "kwargs": {"repo_url": _origin_url},},
-        "current_interval": timedelta(days=64),
-        "id": _task_id,
-        "next_run": datetime.now(tz=timezone.utc) + timedelta(days=64),
-        "policy": "oneshot",
-        "priority": "high",
-        "retries_left": 0,
-        "status": task_status,
-        "type": "load-git",
-    }
-    mock_scheduler.get_tasks.return_value = [dict(task) if not task_archived else None]
-
-    task_run = {
-        "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205",
-        "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
-        "id": 654270631,
-        "metadata": {},
-        "scheduled": datetime.now(tz=timezone.utc),
-        "started": visit_started_date,
-        "status": task_run_status,
-        "task": _task_id,
-    }
-    mock_scheduler.get_task_runs.return_value = [
-        dict(task_run) if not task_archived else None
-    ]
+    task = task_run = None
+    if not task_archived:
+        task = swh_scheduler.create_tasks(
+            [create_oneshot_task_dict("load-git", repo_url=_origin_url)]
+        )[0]
+        backend_id = str(uuid.uuid4())
+
+        if task_status != "next_run_not_scheduled":
+            swh_scheduler.schedule_task_run(task["id"], backend_id)
+
+        if task_run_status is not None:
+            swh_scheduler.start_task_run(backend_id)
+            task_run = dict(
+                swh_scheduler.end_task_run(backend_id, task_run_status).items()
+            )
    return task, task_run
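
# The helper above replaces the old hand-built mock dicts: each call walks a
# real task run through scheduler states instead of faking query results.
# Roughly (assumed semantics of the swh.scheduler calls used above):
#
#   create_tasks(...)       -> task created, status next_run_not_scheduled
#   schedule_task_run(...)  -> a run exists for the task, keyed by backend_id
#   start_task_run(...)     -> the run is marked started
#   end_task_run(..., s)    -> the run ends with status s ("eventful", "failed", ...)
#
# dict(run.items()) at the end presumably copies the returned mapping into a
# plain dict so the tests can index it exactly like the old mock dict.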
task_run["started"], "ended": task_run["ended"], "status": task_run["status"], "visit_status": sor.visit_status, } if not task_archived else {} ) if es_available and not task_archived: expected_result.update( { "message": task_exec_data["message"], "name": task_exec_data["swh_task_name"], "worker": task_exec_data["hostname"], } ) if not full_info: expected_result.pop("id", None) expected_result.pop("backend_id", None) expected_result.pop("worker", None) if "message" in expected_result: message = "" message_lines = expected_result["message"].split("\n") for line in message_lines: if line.startswith("Traceback"): break message += f"{line}\n" message += message_lines[-1] expected_result["message"] = message assert sor_task_info == expected_result @pytest.mark.django_db -def test_get_save_origin_requests_find_visit_date(mocker): +def test_get_save_origin_requests_find_visit_date(mocker, swh_scheduler): # create a save request SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archive - _mock_scheduler(mocker) + _fill_scheduler_db(swh_scheduler) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} mock_get_origin_visits = mocker.patch( "swh.web.common.origin_save.get_origin_visits" ) # create a visit for the save request visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", status=VISIT_STATUS_FULL, type=_visit_type, url="", visit=34, ) mock_get_origin_visits.return_value = [visit_info] # check visit date has been correctly found sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] == visit_date mock_get_origin_visits.assert_called_once() # check visit is not searched again when it has been found get_save_origin_requests(_visit_type, _origin_url) mock_get_origin_visits.assert_called_once() # check visit date are not searched for save requests older than # one month sor = SaveOriginRequest.objects.create( visit_type=_visit_type, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, loading_task_id=_task_id, visit_date=None, ) sor.request_date = datetime.now(tz=timezone.utc) - timedelta(days=31) sor.save() - _mock_scheduler(mocker, task_status="disabled", task_run_status="failed") + _fill_scheduler_db(swh_scheduler, task_status="disabled", task_run_status="failed") sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 2 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_date"] is None mock_get_origin_visits.assert_called_once() def _get_save_origin_requests( - mocker, load_status, visit_status, request_date: Optional[datetime] = None + mocker, + swh_scheduler, + load_status, + visit_status, + request_date: Optional[datetime] = None, ): """Wrapper around the get_origin_save_origin_request call. 
""" SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=visit_status, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archives - _mock_scheduler( - mocker, task_status="next_run_scheduled", task_run_status=load_status + _fill_scheduler_db( + swh_scheduler, task_status="next_run_scheduled", task_run_status=load_status ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} mock_get_origin_visits = mocker.patch( "swh.web.common.origin_save.get_origin_visits" ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=visit_status, type=_visit_type, url="", visit=34, ) mock_get_origin_visits.return_value = [visit_info] sors = get_save_origin_requests(_visit_type, _origin_url) mock_get_origin_visits.assert_called_once() return sors @pytest.mark.parametrize("visit_date", [None, "some-date"]) def test_from_save_origin_request_to_save_request_info_dict(visit_date): """Ensure save request to json serializable dict is fine """ request_date = datetime.now(tz=timezone.utc) _visit_date = request_date + timedelta(minutes=5) if visit_date else None request_date = datetime.now(tz=timezone.utc) sor = SaveOriginRequest( request_date=request_date, visit_type=_visit_type, visit_status=VISIT_STATUS_FULL, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, loading_task_status=None, visit_date=_visit_date, loading_task_id=1, ) assert sor.to_dict() == SaveOriginRequestInfo( id=sor.id, origin_url=sor.origin_url, visit_type=sor.visit_type, save_request_date=sor.request_date.isoformat(), save_request_status=sor.status, save_task_status=sor.loading_task_status, visit_status=sor.visit_status, visit_date=_visit_date.isoformat() if _visit_date else None, loading_task_id=sor.loading_task_id, ) def test__check_origin_exists_404(requests_mock): url_ko = "https://example.org/some-inexistant-url" requests_mock.head(url_ko, status_code=404) with pytest.raises(BadInputExc, match="not exist"): _check_origin_exists(url_ko) def test__check_origin_exists_200(requests_mock): url = "https://example.org/url" requests_mock.head(url, status_code=200) # passes the check actual_metadata = _check_origin_exists(url) # and we actually may have retrieved some metadata on the origin assert actual_metadata == origin_exists(url) def test_origin_exists_404(requests_mock): """Origin which does not exist should be reported as inexistent""" url_ko = "https://example.org/some-inexistant-url" requests_mock.head(url_ko, status_code=404) actual_result = origin_exists(url_ko) assert actual_result == OriginExistenceCheckInfo( origin_url=url_ko, exists=False, last_modified=None, content_length=None, ) def test_origin_exists_200_no_data(requests_mock): """Existing origin should be reported as such (no extra information)""" url = "http://example.org/real-url" requests_mock.head( url, status_code=200, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, last_modified=None, content_length=None, ) def test_origin_exists_200_with_data(requests_mock): """Existing origin should be reported as such (+ extra information)""" url = "http://example.org/real-url" requests_mock.head( url, status_code=200, 


def test_origin_exists_200_with_data(requests_mock):
    """Existing origin should be reported as such (+ extra information)"""
    url = "http://example.org/real-url"
    requests_mock.head(
        url,
        status_code=200,
        headers={
            "content-length": "10",
            "last-modified": "Sun, 21 Aug 2011 16:26:32 GMT",
        },
    )

    actual_result = origin_exists(url)
    assert actual_result == OriginExistenceCheckInfo(
        origin_url=url,
        exists=True,
        content_length=10,
        last_modified="2011-08-21T16:26:32",
    )


def test_origin_exists_internet_archive(requests_mock):
    """Edge case where an artifact URL to check existence is hosted on the
    Internet Archive"""
    url = (
        "https://web.archive.org/web/20100705043309/"
        "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz"
    )
    redirect_url = (
        "https://web.archive.org/web/20100610004108/"
        "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz"
    )
    requests_mock.head(
        url, status_code=302, headers={"Location": redirect_url,},
    )
    requests_mock.head(
        redirect_url,
        status_code=200,
        headers={
            "X-Archive-Orig-Last-Modified": "Tue, 12 May 2009 22:09:43 GMT",
            "X-Archive-Orig-Content-Length": "121421",
        },
    )

    actual_result = origin_exists(url)
    assert actual_result == OriginExistenceCheckInfo(
        origin_url=url,
        exists=True,
        content_length=121421,
        last_modified="2009-05-12T22:09:43",
    )


def test_origin_exists_200_with_data_unexpected_date_format(requests_mock):
    """Existing origin should be ok, but an unexpected last modified time
    results in no date at all"""
    url = "http://example.org/real-url2"
    # this is parsable but not as expected
    unexpected_format_date = "Sun, 21 Aug 2021 16:26:32"
    requests_mock.head(
        url, status_code=200, headers={"last-modified": unexpected_format_date,},
    )

    actual_result = origin_exists(url)
    # so the resulting date is None
    assert actual_result == OriginExistenceCheckInfo(
        origin_url=url, exists=True, content_length=None, last_modified=None,
    )


@pytest.mark.django_db
@pytest.mark.parametrize("visit_status", [VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING,])
-def test_get_save_origin_requests_no_visit_date_found(mocker, visit_status):
+def test_get_save_origin_requests_no_visit_date_found(
+    mocker, swh_scheduler, visit_status
+):
    """Save requests with a scheduled task and an in-progress visit status
    (created, ongoing) are reported as running

    """
    sors = _get_save_origin_requests(
-        mocker, load_status="scheduled", visit_status=visit_status,
+        mocker, swh_scheduler, load_status="scheduled", visit_status=visit_status,
    )
    # check the task is reported as running along with the visit information
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_RUNNING
    assert sors[0]["visit_date"] is not None
    assert sors[0]["visit_status"] == visit_status


@pytest.mark.django_db
@pytest.mark.parametrize("visit_status", ["not_found", "failed",])
-def test_get_save_origin_requests_no_failed_status_override(mocker, visit_status):
+def test_get_save_origin_requests_no_failed_status_override(
+    mocker, swh_scheduler, visit_status
+):
    """Uneventful visits with failed statuses (failed, not found) are marked as failed

    """
    sors = _get_save_origin_requests(
-        mocker, load_status="uneventful", visit_status=visit_status
+        mocker, swh_scheduler, load_status="uneventful", visit_status=visit_status
    )
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_FAILED
    visit_date = sors[0]["visit_date"]
    assert visit_date is not None

    sors = get_save_origin_requests(_visit_type, _origin_url)
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_FAILED
    assert sors[0]["visit_status"] == visit_status
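
# The Internet Archive test above implies origin_exists follows the 302
# redirect and falls back to the X-Archive-Orig-* copies of the original
# response headers. A sketch of that fallback (hypothetical helper, not the
# actual implementation):

from typing import Dict, Optional

def archived_header(headers: Dict[str, str], name: str) -> Optional[str]:
    return headers.get(name) or headers.get(f"X-Archive-Orig-{name}")

headers = {"X-Archive-Orig-Content-Length": "121421"}
assert archived_header(headers, "Content-Length") == "121421"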


@pytest.mark.django_db
@pytest.mark.parametrize(
    "load_status,visit_status",
    [
        ("eventful", VISIT_STATUS_FULL),
        ("eventful", VISIT_STATUS_PARTIAL),
        ("uneventful", VISIT_STATUS_PARTIAL),
    ],
)
-def test_get_visit_info_for_save_request_succeeded(mocker, load_status, visit_status):
+def test_get_visit_info_for_save_request_succeeded(
+    mocker, swh_scheduler, load_status, visit_status
+):
    """Nominal scenario, below 30 days, returns something"""
    sors = _get_save_origin_requests(
-        mocker, load_status=load_status, visit_status=visit_status
+        mocker, swh_scheduler, load_status=load_status, visit_status=visit_status
    )
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
    assert sors[0]["visit_date"] is not None
    assert sors[0]["visit_status"] == visit_status

    sors = get_save_origin_requests(_visit_type, _origin_url)
    assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
    assert sors[0]["visit_status"] == visit_status


@pytest.mark.django_db
@pytest.mark.parametrize("load_status", ["eventful", "uneventful",])
-def test_get_visit_info_incomplete_visit_still_successful(mocker, load_status):
+def test_get_visit_info_incomplete_visit_still_successful(
+    mocker, swh_scheduler, load_status
+):
    """Incomplete visit information, yet the task is reported successful

    """
    sors = _get_save_origin_requests(
-        mocker, load_status=load_status, visit_status=None,
+        mocker, swh_scheduler, load_status=load_status, visit_status=None,
    )
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
    # As the visit status is still missing though
    assert sors[0]["visit_date"] is not None
    assert sors[0]["visit_status"] is None

    # It's still detected as to be updated by the refresh routine
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
    assert sors[0]["visit_date"] is not None
    assert sors[0]["visit_status"] is None
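
# Taken together, the assertions above suggest the refresh routine keeps
# polling a save request until both the loading task and the visit reach a
# final state. A sketch of that stop condition (an assumption inferred from
# the tests, not the actual swh.web predicate):

def still_needs_refresh(save_task_status, visit_status):
    task_final = save_task_status in ("succeeded", "failed")
    visit_final = visit_status in ("full", "partial", "failed", "not_found")
    return not (task_final and visit_final)

assert still_needs_refresh("succeeded", None)  # incomplete visit info
assert not still_needs_refresh("succeeded", "full")  # final, dropped from refresh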


@pytest.mark.django_db
-def test_refresh_in_progress_save_request_statuses(mocker, api_client, archive_data):
+def test_refresh_in_progress_save_request_statuses(
+    mocker, swh_scheduler, api_client, archive_data
+):
    """Refresh a pending save origin request and update it if the status
    changes

    """
    date_now = datetime.now(tz=timezone.utc)
    date_pivot = date_now - timedelta(days=30)
    visit_started_date = date_now - timedelta(minutes=1)

    # returned visit status
    SaveOriginRequest.objects.create(
        request_date=datetime.now(tz=timezone.utc),
        visit_type=_visit_type,
        visit_status=VISIT_STATUS_CREATED,
        origin_url=_origin_url,
        status=SAVE_REQUEST_ACCEPTED,
        visit_date=None,
        loading_task_id=_task_id,
    )

    # mock scheduler and archives
-    _mock_scheduler(
-        mocker, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED
+    _fill_scheduler_db(
+        swh_scheduler,
+        task_status="next_run_scheduled",
+        task_run_status=SAVE_TASK_SCHEDULED,
    )
    mock_archive = mocker.patch("swh.web.common.origin_save.archive")
    mock_archive.lookup_origin.return_value = {"url": _origin_url}
    mock_get_origin_visits = mocker.patch(
        "swh.web.common.origin_save.get_origin_visits"
    )
    # create a visit for the save request with status created
    visit_date = datetime.now(tz=timezone.utc).isoformat()
    visit_info = OriginVisitInfo(
        date=visit_date,
        formatted_date="",
        metadata={},
        origin=_origin_url,
        snapshot="",  # make mypy happy
        status=VISIT_STATUS_CREATED,
        type=_visit_type,
        url="",
        visit=34,
    )
    mock_get_origin_visits.return_value = [visit_info]

    # make the scheduler return a running event
-    _mock_scheduler(
-        mocker,
+    _fill_scheduler_db(
+        swh_scheduler,
        task_status="next_run_scheduled",
        task_run_status="started",
        visit_started_date=visit_started_date,
    )

    # The visit is detected but still running
    sors = refresh_save_origin_request_statuses()
    assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1

    assert len(sors) == 1

    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # The status is updated
        assert sor["save_task_status"] == SAVE_TASK_RUNNING
        # but the visit itself is not over yet, so its status is unchanged
        assert sor["visit_date"] is not None
        assert sor["visit_status"] == VISIT_STATUS_CREATED

    # make the visit status completed
    # and make the scheduler return a completed, eventful run
-    _mock_scheduler(
-        mocker,
+    _fill_scheduler_db(
+        swh_scheduler,
        task_status="completed",
        task_run_status="eventful",
        visit_started_date=visit_started_date,
    )

    # This time around, the origin returned will have all required information updated
    # (visit date and visit status in final state)
    visit_date = datetime.now(tz=timezone.utc).isoformat()
    visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL})
    mock_get_origin_visits.return_value = [visit_info]

    # Detected entry, this time it should be updated
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1
    assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1

    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # the sor is now in its final state (succeeded task, full visit)
        assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED
        assert sor["visit_date"] == visit_date
        assert sor["visit_status"] == VISIT_STATUS_FULL

    # Once in final state, a sor should not be updated anymore
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 0
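
# The refresh tests above and below pivot on how a scheduler task-run status
# translates into a save_task_status. The mapping suggested by the
# assertions (inferred; the real table lives in swh.web.common.origin_save):

_RUN_STATUS_TO_SAVE_TASK_STATUS = {
    "scheduled": SAVE_TASK_SCHEDULED,
    "started": SAVE_TASK_RUNNING,
    "eventful": SAVE_TASK_SUCCEEDED,
    "uneventful": SAVE_TASK_SUCCEEDED,  # unless the visit failed or was not found
    "failed": SAVE_TASK_FAILED,
}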


@pytest.mark.django_db
-def test_refresh_save_request_statuses(mocker, api_client, archive_data):
+def test_refresh_save_request_statuses(mocker, swh_scheduler, api_client, archive_data):
    """Refresh filters save origin requests and updates them if anything
    changed

    """
    date_now = datetime.now(tz=timezone.utc)
    date_pivot = date_now - timedelta(days=30)

    # returned visit status
    SaveOriginRequest.objects.create(
        request_date=datetime.now(tz=timezone.utc),
        visit_type=_visit_type,
        visit_status=None,
        origin_url=_origin_url,
        status=SAVE_REQUEST_ACCEPTED,
        visit_date=None,
        loading_task_id=_task_id,
    )

    # mock scheduler and archives
-    _mock_scheduler(
-        mocker, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED
+    _fill_scheduler_db(
+        swh_scheduler,
+        task_status="next_run_scheduled",
+        task_run_status=SAVE_TASK_SCHEDULED,
    )
    mock_archive = mocker.patch("swh.web.common.origin_save.archive")
    mock_archive.lookup_origin.return_value = {"url": _origin_url}
    mock_get_origin_visits = mocker.patch(
        "swh.web.common.origin_save.get_origin_visits"
    )
    # create a visit for the save request with status created
    visit_date = datetime.now(tz=timezone.utc).isoformat()
    visit_info = OriginVisitInfo(
        date=visit_date,
        formatted_date="",
        metadata={},
        origin=_origin_url,
        snapshot="",  # make mypy happy
        status=VISIT_STATUS_CREATED,
        type=_visit_type,
        url="",
        visit=34,
    )
    mock_get_origin_visits.return_value = [visit_info]

    # no changes so refresh does detect the entry but does nothing
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1

    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # as it turns out, in this test, this won't update anything as no new status got
        # returned by the scheduler
        assert sor["save_task_status"] == SAVE_TASK_RUNNING
        # the visit has not produced new information either
        assert sor["visit_date"] == visit_date
        assert sor["visit_status"] == VISIT_STATUS_CREATED

    # A save code now entry is detected for update, but as nothing changes, the entry
    # remains in the same state
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1

    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # Status is not updated as no new information is available on the visit status
        # and the task status has not moved
        assert sor["save_task_status"] == SAVE_TASK_RUNNING
        # the visit information is unchanged
        assert sor["visit_date"] == visit_date
        assert sor["visit_status"] == VISIT_STATUS_CREATED

    # This time around, the origin returned will have all information updated
    # create a visit for the save request with status full
    visit_date = datetime.now(tz=timezone.utc).isoformat()
    visit_info = OriginVisitInfo(
        date=visit_date,
        formatted_date="",
        metadata={},
        origin=_origin_url,
        snapshot="",  # make mypy happy
        status=VISIT_STATUS_FULL,
        type=_visit_type,
        url="",
        visit=34,
    )
    mock_get_origin_visits.return_value = [visit_info]

    # Detected entry, this time it should be updated
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1

    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # with a full visit reported, the save request is now considered succeeded
        assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED
        assert sor["visit_date"] == visit_date
        assert sor["visit_status"] == VISIT_STATUS_FULL

    # This time, nothing left to update
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 0
diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py
index b443dd8a..fead2e30 100644
--- a/swh/web/tests/conftest.py
+++ b/swh/web/tests/conftest.py
@@ -1,431 +1,482 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

+from datetime import timedelta
import json
import os
import shutil
from subprocess import PIPE, run
import sys
from typing import Any, Dict, List, Optional

from _pytest.python import Function
from hypothesis import HealthCheck, settings
import pytest

from django.core.cache import cache
from rest_framework.test import APIClient, APIRequestFactory

from swh.model.hashutil import ALGORITHMS, hash_to_bytes
+from swh.scheduler.tests.common import TASK_TYPES
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.algos.snapshot import snapshot_get_all_branches, snapshot_get_latest
from swh.web.auth.utils import OIDC_SWH_WEB_CLIENT_ID
from swh.web.common import converters
+from swh.web.common.origin_save import get_scheduler_load_task_types
from swh.web.common.typing import OriginVisitInfo
from swh.web.config import get_config
from swh.web.tests.data import get_tests_data, override_storages

# Used to skip some tests
ctags_json_missing = (
    shutil.which("ctags") is None
    or b"+json" not in run(["ctags", "--version"], stdout=PIPE).stdout
)

fossology_missing = shutil.which("nomossa") is None

# Register some hypothesis profiles
settings.register_profile("default", settings())

# we use getattr here to keep mypy happy regardless of the hypothesis version
function_scoped_fixture_check = (
    [getattr(HealthCheck, "function_scoped_fixture")]
    if hasattr(HealthCheck, "function_scoped_fixture")
    else []
)

suppress_health_check = [
    HealthCheck.too_slow,
    HealthCheck.filter_too_much,
] + function_scoped_fixture_check

settings.register_profile(
    "swh-web", settings(deadline=None, suppress_health_check=suppress_health_check,),
)

settings.register_profile(
    "swh-web-fast",
    settings(
        deadline=None, max_examples=5, suppress_health_check=suppress_health_check,
    ),
)
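
# Any registered profile can be selected per run, either from the command
# line (pytest --hypothesis-profile=swh-web) or programmatically:

from hypothesis import settings

settings.load_profile("swh-web")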


def pytest_configure(config):
    # Use fast hypothesis profile by default if none has been
    # explicitly specified in pytest option
    if config.getoption("--hypothesis-profile") is None:
        settings.load_profile("swh-web-fast")

    # Small hack in order to be able to run the unit tests
    # without static assets generated by webpack.
    # Those assets are not really needed for the Python tests
    # but the django templates will fail to load due to missing
    # generated file webpack-stats.json describing the js and css
    # files to include.
    # So generate a dummy webpack-stats.json file to overcome
    # that issue.
    test_dir = os.path.dirname(__file__)
    # location of the static folder when running tests through tox
    data_dir = os.path.join(sys.prefix, "share/swh/web")
    static_dir = os.path.join(data_dir, "static")

    if not os.path.exists(static_dir):
        # location of the static folder when running tests locally with pytest
        static_dir = os.path.join(test_dir, "../../../static")

    webpack_stats = os.path.join(static_dir, "webpack-stats.json")
    if os.path.exists(webpack_stats):
        return

    bundles_dir = os.path.join(test_dir, "../../../assets/src/bundles")
    if not os.path.exists(bundles_dir):
        # location of the bundles folder when running tests with tox
        bundles_dir = os.path.join(data_dir, "assets/src/bundles")

    _, bundles, _ = next(os.walk(bundles_dir))

    mock_webpack_stats = {
        "status": "done",
        "publicPath": "/static",
        "chunks": {},
        "assets": {},
    }
    for bundle in bundles:
        asset = f"js/{bundle}.js"
        mock_webpack_stats["chunks"][bundle] = [asset]
        mock_webpack_stats["assets"][asset] = {
            "name": asset,
            "publicPath": f"/static/{asset}",
        }

    with open(webpack_stats, "w") as outfile:
        json.dump(mock_webpack_stats, outfile)


# Clear Django cache before each test
@pytest.fixture(autouse=True)
def django_cache_cleared():
    cache.clear()


# Alias rf fixture from pytest-django
@pytest.fixture
def request_factory(rf):
    return rf


# Fixture to get test client from Django REST Framework
@pytest.fixture
def api_client():
    return APIClient()


# Fixture to get API request factory from Django REST Framework
@pytest.fixture
def api_request_factory():
    return APIRequestFactory()


# Initialize tests data
@pytest.fixture(scope="function", autouse=True)
def tests_data():
    data = get_tests_data(reset=True)
    # Update swh-web configuration to use the in-memory storages
    # instantiated in the tests.data module
    override_storages(
        data["storage"], data["idx_storage"], data["search"], data["counters"]
    )
    return data


# Fixture to manipulate data from a sample archive used in the tests
@pytest.fixture(scope="function")
def archive_data(tests_data):
    return _ArchiveData(tests_data)


# Fixture to manipulate indexer data from a sample archive used in the tests
@pytest.fixture(scope="function")
def indexer_data(tests_data):
    return _IndexerData(tests_data)


# Custom data directory for requests_mock
@pytest.fixture
def datadir():
    return os.path.join(os.path.abspath(os.path.dirname(__file__)), "resources")
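
# For a hypothetical bundle directory named "vault", the stub written by
# pytest_configure above would contain:

mock_webpack_stats_example = {
    "status": "done",
    "publicPath": "/static",
    "chunks": {"vault": ["js/vault.js"]},
    "assets": {
        "js/vault.js": {"name": "js/vault.js", "publicPath": "/static/js/vault.js"}
    },
}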
""" def __init__(self, tests_data): self.storage = tests_data["storage"] def __getattr__(self, key): if key == "storage": raise AttributeError(key) # Forward calls to non overridden Storage methods to wrapped # storage instance return getattr(self.storage, key) def content_find(self, content: Dict[str, Any]) -> Dict[str, Any]: cnt_ids_bytes = { algo_hash: hash_to_bytes(content[algo_hash]) for algo_hash in ALGORITHMS if content.get(algo_hash) } cnt = self.storage.content_find(cnt_ids_bytes) return converters.from_content(cnt[0].to_dict()) if cnt else cnt def content_get(self, cnt_id: str) -> Dict[str, Any]: cnt_id_bytes = hash_to_bytes(cnt_id) content = self.storage.content_get([cnt_id_bytes])[0] if content: content_d = content.to_dict() content_d.pop("ctime", None) else: content_d = None return converters.from_swh( content_d, hashess={"sha1", "sha1_git", "sha256", "blake2s256"} ) def content_get_data(self, cnt_id: str) -> Optional[Dict[str, Any]]: cnt_id_bytes = hash_to_bytes(cnt_id) cnt_data = self.storage.content_get_data(cnt_id_bytes) if cnt_data is None: return None return converters.from_content({"data": cnt_data, "sha1": cnt_id_bytes}) def directory_get(self, dir_id): return {"id": dir_id, "content": self.directory_ls(dir_id)} def directory_ls(self, dir_id): cnt_id_bytes = hash_to_bytes(dir_id) dir_content = map( converters.from_directory_entry, self.storage.directory_ls(cnt_id_bytes) ) return list(dir_content) def release_get(self, rel_id: str) -> Optional[Dict[str, Any]]: rel_id_bytes = hash_to_bytes(rel_id) rel_data = self.storage.release_get([rel_id_bytes])[0] return converters.from_release(rel_data) if rel_data else None def revision_get(self, rev_id: str) -> Optional[Dict[str, Any]]: rev_id_bytes = hash_to_bytes(rev_id) rev_data = self.storage.revision_get([rev_id_bytes])[0] return converters.from_revision(rev_data) if rev_data else None def revision_log(self, rev_id, limit=None): rev_id_bytes = hash_to_bytes(rev_id) return list( map( converters.from_revision, self.storage.revision_log([rev_id_bytes], limit=limit), ) ) def snapshot_get_latest(self, origin_url): snp = snapshot_get_latest(self.storage, origin_url) return converters.from_snapshot(snp.to_dict()) def origin_get(self, origin_urls): origins = self.storage.origin_get(origin_urls) return [converters.from_origin(o.to_dict()) for o in origins] def origin_visit_get(self, origin_url): next_page_token = None visits = [] while True: visit_page = self.storage.origin_visit_get( origin_url, page_token=next_page_token ) next_page_token = visit_page.next_page_token for visit in visit_page.results: visit_status = self.storage.origin_visit_status_get_latest( origin_url, visit.visit ) visits.append( converters.from_origin_visit( {**visit_status.to_dict(), "type": visit.type} ) ) if not next_page_token: break return visits def origin_visit_get_by(self, origin_url: str, visit_id: int) -> OriginVisitInfo: visit = self.storage.origin_visit_get_by(origin_url, visit_id) assert visit is not None visit_status = self.storage.origin_visit_status_get_latest(origin_url, visit_id) assert visit_status is not None return converters.from_origin_visit( {**visit_status.to_dict(), "type": visit.type} ) def origin_visit_status_get_latest( self, origin_url, type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, ): visit_status = origin_get_latest_visit_status( self.storage, origin_url, type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, ) return ( 


class _IndexerData:
    """
    Helper class to manage indexer tests data

    It is initialized with a reference to an in-memory indexer storage
    containing raw tests data.

    It also defines class methods to retrieve those tests data in
    a json serializable format in order to ease tests implementation.
    """

    def __init__(self, tests_data):
        self.idx_storage = tests_data["idx_storage"]
        self.mimetype_indexer = tests_data["mimetype_indexer"]
        self.license_indexer = tests_data["license_indexer"]
        self.ctags_indexer = tests_data["ctags_indexer"]

    def content_add_mimetype(self, cnt_id):
        self.mimetype_indexer.run([hash_to_bytes(cnt_id)])

    def content_get_mimetype(self, cnt_id):
        mimetype = self.idx_storage.content_mimetype_get([hash_to_bytes(cnt_id)])[
            0
        ].to_dict()
        return converters.from_filetype(mimetype)

    def content_add_license(self, cnt_id):
        self.license_indexer.run([hash_to_bytes(cnt_id)])

    def content_get_license(self, cnt_id):
        cnt_id_bytes = hash_to_bytes(cnt_id)
        licenses = self.idx_storage.content_fossology_license_get([cnt_id_bytes])
        for license in licenses:
            yield converters.from_swh(license.to_dict(), hashess={"id"})

    def content_add_ctags(self, cnt_id):
        self.ctags_indexer.run([hash_to_bytes(cnt_id)])

    def content_get_ctags(self, cnt_id):
        cnt_id_bytes = hash_to_bytes(cnt_id)
        ctags = self.idx_storage.content_ctags_get([cnt_id_bytes])
        for ctag in ctags:
            yield converters.from_swh(ctag, hashess={"id"})


@pytest.fixture
def keycloak_oidc(keycloak_oidc, mocker):
    keycloak_config = get_config()["keycloak"]

    keycloak_oidc.server_url = keycloak_config["server_url"]
    keycloak_oidc.realm_name = keycloak_config["realm_name"]
    keycloak_oidc.client_id = OIDC_SWH_WEB_CLIENT_ID

    keycloak_oidc_client = mocker.patch("swh.web.auth.views.keycloak_oidc_client")
    keycloak_oidc_client.return_value = keycloak_oidc

    return keycloak_oidc


@pytest.fixture
def subtest(request):
    """A hack to explicitly set up and tear down fixtures.

    This fixture allows you to set up and tear down fixtures within the test
    function itself. This is useful (necessary!) for using Hypothesis inside
    pytest, as hypothesis will call the test function multiple times, without
    setting up or tearing down fixture state as it is normally the case.

    Copied from the pytest-subtesthack project, public domain license
    (https://github.com/untitaker/pytest-subtesthack).
    """
    parent_test = request.node

    def inner(func):
        if hasattr(Function, "from_parent"):
            item = Function.from_parent(
                parent_test,
                name=request.function.__name__ + "[]",
                originalname=request.function.__name__,
                callobj=func,
            )
        else:
            item = Function(
                name=request.function.__name__ + "[]", parent=parent_test, callobj=func
            )
        nextitem = parent_test  # prevents pytest from tearing down module fixtures

        item.ihook.pytest_runtest_setup(item=item)
        item.ihook.pytest_runtest_call(item=item)
        item.ihook.pytest_runtest_teardown(item=item, nextitem=nextitem)

    return inner
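
# Sketch of how a hypothesis-driven test is expected to use the subtest
# fixture above: the decorated inner function is run through the full
# setup/call/teardown cycle, so function-scoped fixtures are fresh for each
# generated example (hypothetical test body):

from hypothesis import given, strategies

@given(strategies.text())
def test_example(subtest, value):
    @subtest
    def _check():
        # runs through pytest_runtest_setup/call/teardown each time
        assert isinstance(value, str)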
""" parent_test = request.node def inner(func): if hasattr(Function, "from_parent"): item = Function.from_parent( parent_test, name=request.function.__name__ + "[]", originalname=request.function.__name__, callobj=func, ) else: item = Function( name=request.function.__name__ + "[]", parent=parent_test, callobj=func ) nextitem = parent_test # prevents pytest from tearing down module fixtures item.ihook.pytest_runtest_setup(item=item) item.ihook.pytest_runtest_call(item=item) item.ihook.pytest_runtest_teardown(item=item, nextitem=nextitem) return inner + + +@pytest.fixture +def swh_scheduler(swh_scheduler): + config = get_config() + scheduler = config["scheduler"] + config["scheduler"] = swh_scheduler + # create load-git and load-hg task types + for task_type in TASK_TYPES.values(): + swh_scheduler.create_task_type(task_type) + # create load-svn task type + swh_scheduler.create_task_type( + { + "type": "load-svn", + "description": "Update a mercurial repository", + "backend_name": "swh.loader.svn.tasks.DumpMountAndLoadSvnRepository", + "default_interval": timedelta(days=64), + "min_interval": timedelta(hours=12), + "max_interval": timedelta(days=64), + "backoff_factor": 2, + "max_queue_length": None, + "num_retries": 7, + "retry_delay": timedelta(hours=2), + } + ) + + # add method to add load-archive-files task type during tests + def add_load_archive_task_type(): + swh_scheduler.create_task_type( + { + "type": "load-archive-files", + "description": "Load tarballs", + "backend_name": "swh.loader.package.archive.tasks.LoadArchive", + "default_interval": timedelta(days=64), + "min_interval": timedelta(hours=12), + "max_interval": timedelta(days=64), + "backoff_factor": 2, + "max_queue_length": None, + "num_retries": 7, + "retry_delay": timedelta(hours=2), + } + ) + + swh_scheduler.add_load_archive_task_type = add_load_archive_task_type + + yield swh_scheduler + config["scheduler"] = scheduler + get_scheduler_load_task_types.cache_clear() diff --git a/swh/web/tests/misc/test_metrics.py b/swh/web/tests/misc/test_metrics.py index 8b39b5b1..995ed451 100644 --- a/swh/web/tests/misc/test_metrics.py +++ b/swh/web/tests/misc/test_metrics.py @@ -1,131 +1,131 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta from itertools import product import random from prometheus_client.exposition import CONTENT_TYPE_LATEST import pytest from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, SaveOriginRequest, ) from swh.web.common.origin_save import ( ACCEPTED_SAVE_REQUESTS_DELAY_METRIC, ACCEPTED_SAVE_REQUESTS_METRIC, SUBMITTED_SAVE_REQUESTS_METRIC, get_savable_visit_types, ) from swh.web.common.utils import reverse from swh.web.tests.django_asserts import assert_contains from swh.web.tests.utils import check_http_get_response @pytest.mark.django_db -def test_origin_save_metrics(client): +def test_origin_save_metrics(client, swh_scheduler): visit_types = get_savable_visit_types() request_statuses = ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING, ) load_task_statuses = ( SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED, 
diff --git a/swh/web/tests/misc/test_metrics.py b/swh/web/tests/misc/test_metrics.py
index 8b39b5b1..995ed451 100644
--- a/swh/web/tests/misc/test_metrics.py
+++ b/swh/web/tests/misc/test_metrics.py
@@ -1,131 +1,131 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import timedelta
from itertools import product
import random

from prometheus_client.exposition import CONTENT_TYPE_LATEST
import pytest

from swh.web.common.models import (
    SAVE_REQUEST_ACCEPTED,
    SAVE_REQUEST_PENDING,
    SAVE_REQUEST_REJECTED,
    SAVE_TASK_FAILED,
    SAVE_TASK_NOT_CREATED,
    SAVE_TASK_NOT_YET_SCHEDULED,
    SAVE_TASK_RUNNING,
    SAVE_TASK_SCHEDULED,
    SAVE_TASK_SUCCEEDED,
    SaveOriginRequest,
)
from swh.web.common.origin_save import (
    ACCEPTED_SAVE_REQUESTS_DELAY_METRIC,
    ACCEPTED_SAVE_REQUESTS_METRIC,
    SUBMITTED_SAVE_REQUESTS_METRIC,
    get_savable_visit_types,
)
from swh.web.common.utils import reverse
from swh.web.tests.django_asserts import assert_contains
from swh.web.tests.utils import check_http_get_response


@pytest.mark.django_db
-def test_origin_save_metrics(client):
+def test_origin_save_metrics(client, swh_scheduler):
    visit_types = get_savable_visit_types()
    request_statuses = (
        SAVE_REQUEST_ACCEPTED,
        SAVE_REQUEST_REJECTED,
        SAVE_REQUEST_PENDING,
    )

    load_task_statuses = (
        SAVE_TASK_NOT_CREATED,
        SAVE_TASK_NOT_YET_SCHEDULED,
        SAVE_TASK_SCHEDULED,
        SAVE_TASK_SUCCEEDED,
        SAVE_TASK_FAILED,
        SAVE_TASK_RUNNING,
    )

    for _ in range(random.randint(50, 100)):
        visit_type = random.choice(visit_types)
        request_status = random.choice(request_statuses)
        load_task_status = random.choice(load_task_statuses)
        sor = SaveOriginRequest.objects.create(
            origin_url="origin",
            visit_type=visit_type,
            status=request_status,
            loading_task_status=load_task_status,
        )
        if load_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED):
            delay = random.choice(range(60))
            sor.visit_date = sor.request_date + timedelta(seconds=delay)
            sor.save()
    # Note that this injects dates in the future for the sake of the test only

    url = reverse("metrics-prometheus")
    resp = check_http_get_response(
        client, url, status_code=200, content_type=CONTENT_TYPE_LATEST
    )

    accepted_requests = SaveOriginRequest.objects.filter(status=SAVE_REQUEST_ACCEPTED)

    labels_set = product(visit_types, load_task_statuses)

    for labels in labels_set:
        sor_count = accepted_requests.filter(
            visit_type=labels[0], loading_task_status=labels[1]
        ).count()

        metric_text = (
            f"{ACCEPTED_SAVE_REQUESTS_METRIC}{{"
            f'load_task_status="{labels[1]}",'
            f'visit_type="{labels[0]}"}} {float(sor_count)}\n'
        )

        assert_contains(resp, metric_text)

    labels_set = product(visit_types, request_statuses)

    for labels in labels_set:
        sor_count = SaveOriginRequest.objects.filter(
            visit_type=labels[0], status=labels[1]
        ).count()

        metric_text = (
            f"{SUBMITTED_SAVE_REQUESTS_METRIC}{{"
            f'status="{labels[1]}",'
            f'visit_type="{labels[0]}"}} {float(sor_count)}\n'
        )

        assert_contains(resp, metric_text)

    # delay metrics
    save_requests = SaveOriginRequest.objects.all()
    labels_set = product(visit_types, (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED,))
    for labels in labels_set:
        sors = save_requests.filter(
            visit_type=labels[0],
            loading_task_status=labels[1],
            visit_date__isnull=False,
        )

        delay = 0
        for sor in sors:
            delay += sor.visit_date.timestamp() - sor.request_date.timestamp()

        metric_delay_text = (
            f"{ACCEPTED_SAVE_REQUESTS_DELAY_METRIC}{{"
            f'load_task_status="{labels[1]}",'
            f'visit_type="{labels[0]}"}} {float(delay)}\n'
        )

        assert_contains(resp, metric_delay_text)
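
# The metric lines asserted above follow the Prometheus text exposition
# format, name{label="value",...} <sample>, and the f-strings rely on
# float() rendering integral counts with a trailing ".0". A self-contained
# check (metric name and label values hypothetical):

count = 3
line = f'example_metric{{visit_type="git"}} {float(count)}\n'
assert line == 'example_metric{visit_type="git"} 3.0\n'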
- - """ - url = reverse("origin-save-types-list") - resp = check_http_get_response(client, url, status_code=200) - - actual_response = resp.json() - assert set(actual_response) == set(VISIT_TYPES) - - -@pytest.mark.django_db -def test_save_types_list_privileged(client, keycloak_oidc): - """Privileged listing should display all visit types. - - """ - keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] - client.login(code="", code_verifier="", redirect_uri="") - - url = reverse("origin-save-types-list") - resp = check_http_get_response(client, url, status_code=200) - - actual_response = resp.json() - assert set(actual_response) == set(PRIVILEGED_VISIT_TYPES) - - @pytest.mark.django_db def test_save_origin_requests_list(client, mocker): nb_origins_per_type = 10 for visit_type in VISIT_TYPES: for i in range(nb_origins_per_type): SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=visit_type, origin_url=f"https://{visit_type}.example.org/project{i}", status=SAVE_REQUEST_ACCEPTED, visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1), loading_task_id=i, loading_task_status=SAVE_TASK_SUCCEEDED, ) mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") mock_scheduler.get_tasks.return_value = [] mock_scheduler.get_task_runs.return_value = [] # retrieve all save requests in 3 pages, sorted in descending order # of request creation for i, visit_type in enumerate(reversed(VISIT_TYPES)): url = reverse( "origin-save-requests-list", url_args={"status": "all"}, query_params={ "draw": i + 1, "search[value]": "", "order[0][column]": "0", "columns[0][name]": "request_date", "order[0][dir]": "desc", "length": nb_origins_per_type, "start": i * nb_origins_per_type, }, ) resp = check_http_get_response( client, url, status_code=200, content_type="application/json" ) sors = json.loads(resp.content.decode("utf-8")) assert sors["draw"] == i + 1 assert sors["recordsFiltered"] == len(VISIT_TYPES) * nb_origins_per_type assert sors["recordsTotal"] == len(VISIT_TYPES) * nb_origins_per_type assert len(sors["data"]) == nb_origins_per_type assert all(d["visit_type"] == visit_type for d in sors["data"]) # retrieve save requests filtered by visit type in a single page for i, visit_type in enumerate(reversed(VISIT_TYPES)): url = reverse( "origin-save-requests-list", url_args={"status": "all"}, query_params={ "draw": i + 1, "search[value]": visit_type, "order[0][column]": "0", "columns[0][name]": "request_date", "order[0][dir]": "desc", "length": nb_origins_per_type, "start": 0, }, ) resp = check_http_get_response( client, url, status_code=200, content_type="application/json" ) sors = json.loads(resp.content.decode("utf-8")) assert sors["draw"] == i + 1 assert sors["recordsFiltered"] == nb_origins_per_type assert sors["recordsTotal"] == len(VISIT_TYPES) * nb_origins_per_type assert len(sors["data"]) == nb_origins_per_type assert all(d["visit_type"] == visit_type for d in sors["data"]) @pytest.mark.django_db def test_save_origin_requests_list_user_filter(client, mocker, keycloak_oidc): # anonymous user created a save request sor = SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type="svn", origin_url="https://svn.example.org/user/project", status=SAVE_REQUEST_ACCEPTED, visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1), loading_task_id=1, loading_task_status=SAVE_TASK_SUCCEEDED, ) # authenticated user created a save request user = oidc_user_from_profile(keycloak_oidc, keycloak_oidc.login()) client.login(code="", 
code_verifier="", redirect_uri="") sor = SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type="git", origin_url="https://git.example.org/user/project", status=SAVE_REQUEST_ACCEPTED, visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1), loading_task_id=2, loading_task_status=SAVE_TASK_SUCCEEDED, user_ids=f'"{user.id}"', ) # filter save requests according to user id url = reverse( "origin-save-requests-list", url_args={"status": "all"}, query_params={ "draw": 1, "search[value]": "", "order[0][column]": "0", "columns[0][name]": "request_date", "order[0][dir]": "desc", "length": 10, "start": "0", "user_requests_only": "1", }, ) resp = check_http_get_response( client, url, status_code=200, content_type="application/json" ) sors = json.loads(resp.content.decode("utf-8")) assert sors["recordsFiltered"] == 1 assert sors["recordsTotal"] == 2 assert sors["data"][0] == sor.to_dict()