diff --git a/assets/src/bundles/save/index.js b/assets/src/bundles/save/index.js
index d2c54ac7..1a893b32 100644
--- a/assets/src/bundles/save/index.js
+++ b/assets/src/bundles/save/index.js
@@ -1,561 +1,561 @@
/**
* Copyright (C) 2018-2021 The Software Heritage developers
* See the AUTHORS file at the top-level directory of this distribution
* License: GNU Affero General Public License version 3, or any later version
* See top-level LICENSE file for more information
*/
import {csrfPost, handleFetchError, isGitRepoUrl, htmlAlert, removeUrlFragment,
getCanonicalOriginURL} from 'utils/functions';
import {swhSpinnerSrc} from 'utils/constants';
import artifactFormRowTemplate from './artifact-form-row.ejs';
// DataTables instance backing the save requests list; it is assigned in
// initOriginSave() and redrawn from the tab and user-filter handlers there
let saveRequestsTable;
/**
 * Submit a save request for an origin and report the outcome through callbacks.
 *
 * The '.swh-processing-save-request' element is displayed while the request is
 * in flight and hidden again as soon as it settles.
 *
 * @param {string} originType - visit type of the origin
 * @param {string} originUrl - URL of the origin to save
 * @param {Object} extraData - extra payload sent as JSON body; nothing is sent
 *     when it is empty
 * @param {Function} acceptedCallback - called without argument when the
 *     response has save_request_status === 'accepted'
 * @param {Function} pendingCallback - called without argument for any other
 *     successful response
 * @param {Function} errorCallback - called with (statusCode, errorData);
 *     statusCode is undefined when the failure is not an HTTP response (e.g.
 *     network error) and errorData is {} when no JSON payload could be decoded
 */
async function originSaveRequest(
  originType, originUrl, extraData,
  acceptedCallback, pendingCallback, errorCallback
) {
  // Actually trigger the origin save request
  const addSaveOriginRequestUrl = Urls.api_1_save_origin(originType, originUrl);
  $('.swh-processing-save-request').css('display', 'block');
  let headers = {};
  let body = null;
  // comparing against a fresh `{}` literal is always true (identity comparison),
  // so test for actual content instead
  if (extraData && Object.keys(extraData).length > 0) {
    body = JSON.stringify(extraData);
    headers = {
      'Content-Type': 'application/json'
    };
  }

  try {
    const response = await csrfPost(addSaveOriginRequestUrl, headers, body);
    handleFetchError(response);
    const data = await response.json();
    $('.swh-processing-save-request').css('display', 'none');
    if (data.save_request_status === 'accepted') {
      acceptedCallback();
    } else {
      pendingCallback();
    }
  } catch (error) {
    $('.swh-processing-save-request').css('display', 'none');
    // what gets caught is not necessarily a response object: a network failure
    // or an exception raised by a callback ends up here too
    const isResponse = error !== null && typeof error === 'object' &&
      typeof error.json === 'function';
    let errorData = {};
    if (isResponse) {
      try {
        errorData = await error.json();
      } catch (parseError) {
        // error body is not JSON (e.g. HTML error page): report the status only
        errorData = {};
      }
    }
    errorCallback(isResponse ? error.status : undefined, errorData);
  }
}
/**
 * Autofill the artifact version input of an artifact form row with the
 * filename found in the artifact URL, stripped of its extension.
 *
 * @param {number} formId - identifier of the artifact form row, used to build
 *     the ids of its URL and version inputs
 */
function addArtifactVersionAutofillHandler(formId) {
  $(`#swh-input-artifact-url-${formId}`).on('input', function(event) {
    const artifactUrl = $(this).val().trim();
    let filename = artifactUrl.split('/').slice(-1)[0];
    // no '/' in the input means there is no filename to extract yet
    if (filename !== artifactUrl) {
      // collapse compound tarball extensions (.tar.gz, .tar.bz2, ...) to .tar;
      // the pattern is anchored on the '.tar' extension so that names merely
      // containing "tar" (e.g. startup-2.0.zip) are left untouched
      filename = filename.replace(/\.tar(\..*)?$/, '.tar');
      const filenameNoExt = filename.split('.').slice(0, -1).join('.');
      const artifactVersion = $(`#swh-input-artifact-version-${formId}`);
      // only overwrite the version when an extension was actually removed
      if (filenameNoExt !== filename) {
        artifactVersion.val(filenameNoExt);
      }
    }
  });
}
/**
 * Show and enable the extra 'archives' inputs when that visit type is
 * selected, hide and disable them for any other visit type.
 */
export function maybeRequireExtraInputs() {
  const archivesFormSelector = '.swh-save-origin-archives-form';
  const archivesSelected = $('#swh-input-visit-type').val() === 'archives';

  $(archivesFormSelector).css('display', archivesSelected ? 'flex' : 'none');
  if (archivesSelected) {
    // help paragraph must have block display for proper rendering
    $('#swh-save-origin-archives-help').css('display', 'block');
  }
  // inputs that are not displayed are disabled
  $(`${archivesFormSelector} .form-control`).prop('disabled', !archivesSelected);

  if (archivesSelected && $(archivesFormSelector).length === 1) {
    // insert first artifact row when the archives visit type is selected for the first time
    const firstRowId = 0;
    $(archivesFormSelector).last().after(
      artifactFormRowTemplate({deletableRow: false, formId: firstRowId}));
    addArtifactVersionAutofillHandler(firstRowId);
  }
}
/**
 * Append a deletable artifact form row after the last existing one and
 * install the version autofill handler on it.
 */
export function addArtifactFormRow() {
  const artifactRowSelector = '.swh-save-origin-artifact-form';
  // the new row gets the current number of rows as identifier
  const formId = $(artifactRowSelector).length;
  const newRow = {deletableRow: true, formId};
  $(artifactRowSelector).last().after(artifactFormRowTemplate(newRow));
  addArtifactVersionAutofillHandler(formId);
}
/**
 * Remove the artifact form row enclosing the element that triggered the event.
 *
 * @param {Event} event - event whose target lives inside the row to remove
 */
export function deleteArtifactFormRow(event) {
  const enclosingRow = $(event.target).closest('.swh-save-origin-artifact-form');
  enclosingRow.remove();
}
// Content injected into the 'user-requests-filter' toolbar cell of the save
// requests table for logged-in users (see fnInitComplete in initOriginSave).
// NOTE(review): the template is whitespace-only here although the handlers look
// up '#swh-save-requests-user-filter' - confirm the checkbox markup was not lost
const userRequestsFilterCheckbox = `
`;
/**
 * Initialize the "save code now" page once the DOM is ready.
 *
 * It sets 'git' as default visit type, creates the server-side DataTable
 * listing the save requests, wires the tab handlers, and installs the submit
 * handler of the save form, which posts the request through
 * originSaveRequest() and displays an alert matching the outcome.
 */
export function initOriginSave() {

  $(document).ready(() => {

    $.fn.dataTable.ext.errMode = 'none';

    // set git as the default value as before
    $('#swh-input-visit-type').val('git');

    saveRequestsTable = $('#swh-origin-save-requests')
      .on('error.dt', (e, settings, techNote, message) => {
        $('#swh-origin-save-request-list-error').text('An error occurred while retrieving the save requests list');
        console.log(message);
      })
      .DataTable({
        serverSide: true,
        processing: true,
        language: {
          processing: ``
        },
        ajax: {
          url: Urls.origin_save_requests_list('all'),
          data: (d) => {
            // ask the server for the requests of the logged-in user only
            // when the filter checkbox is checked
            if (swh.webapp.isUserLoggedIn() && $('#swh-save-requests-user-filter').prop('checked')) {
              d.user_requests_only = '1';
            }
          }
        },
        searchDelay: 1000,
        // see https://datatables.net/examples/advanced_init/dom_toolbar.html and the comments section
        // this option customizes datatables UI components by adding an extra checkbox above the table
        // while keeping bootstrap layout
        dom: '<"row"<"col-sm-3"l><"col-sm-6 text-left user-requests-filter"><"col-sm-3"f>>' +
             '<"row"<"col-sm-12"tr>>' +
             '<"row"<"col-sm-5"i><"col-sm-7"p>>',
        fnInitComplete: function() {
          if (swh.webapp.isUserLoggedIn()) {
            $('div.user-requests-filter').html(userRequestsFilterCheckbox);
            $('#swh-save-requests-user-filter').on('change', () => {
              saveRequestsTable.draw();
            });
          }
        },
        columns: [
          {
            data: 'save_request_date',
            name: 'request_date',
            render: (data, type, row) => {
              if (type === 'display') {
                const date = new Date(data);
                return date.toLocaleString();
              }
              return data;
            }
          },
          {
            data: 'visit_type',
            name: 'visit_type'
          },
          {
            data: 'origin_url',
            name: 'origin_url',
            render: (data, type, row) => {
              if (type === 'display') {
                let html = '';
                const sanitizedURL = $.fn.dataTable.render.text().display(data);
                if (row.save_task_status === 'succeeded') {
                  let browseOriginUrl = `${Urls.browse_origin()}?origin_url=${encodeURIComponent(sanitizedURL)}`;
                  if (row.visit_date) {
                    // the visit date is an extra query parameter and must be
                    // introduced by '&timestamp='
                    browseOriginUrl += `&timestamp=${encodeURIComponent(row.visit_date)}`;
                  }
                  html += `${sanitizedURL}`;
                } else {
                  html += sanitizedURL;
                }
                html += ` ` +
                  '';
                return html;
              }
              return data;
            }
          },
          {
            data: 'save_request_status',
            name: 'status'
          },
          {
            data: 'save_task_status',
            name: 'loading_task_status'
          },
          {
            name: 'info',
            render: (data, type, row) => {
              if (row.save_task_status === 'succeeded' || row.save_task_status === 'failed') {
                return ``;
              } else {
                return '';
              }
            }
          },
          {
            render: (data, type, row) => {
              if (row.save_request_status === 'accepted') {
                const saveAgainButton =
                  '';
                return saveAgainButton;
              } else {
                return '';
              }
            }
          }
        ],
        scrollY: '50vh',
        scrollCollapse: true,
        order: [[0, 'desc']],
        responsive: {
          details: {
            type: 'none'
          }
        }
      });

    swh.webapp.addJumpToPagePopoverToDataTable(saveRequestsTable);

    // keep the URL fragment in sync with the displayed tab
    $('#swh-origin-save-requests-list-tab').on('shown.bs.tab', () => {
      saveRequestsTable.draw();
      window.location.hash = '#requests';
    });

    $('#swh-origin-save-request-help-tab').on('shown.bs.tab', () => {
      removeUrlFragment();
      $('.swh-save-request-info').popover('dispose');
    });

    const saveRequestAcceptedAlert = htmlAlert(
      'success',
      'The "save code now" request has been accepted and will be processed as soon as possible.',
      true
    );

    const saveRequestPendingAlert = htmlAlert(
      'warning',
      'The "save code now" request has been put in pending state and may be accepted for processing after manual review.',
      true
    );

    const saveRequestRateLimitedAlert = htmlAlert(
      'danger',
      'The rate limit for "save code now" requests has been reached. Please try again later.',
      true
    );

    const saveRequestUnknownErrorAlert = htmlAlert(
      'danger',
      'An unexpected error happened when submitting the "save code now request".',
      true
    );

    $('#swh-save-origin-form').submit(async event => {
      event.preventDefault();
      event.stopPropagation();
      $('.alert').alert('close');
      if (event.target.checkValidity()) {
        $(event.target).removeClass('was-validated');
        const originType = $('#swh-input-visit-type').val();
        let originUrl = $('#swh-input-origin-url').val();

        originUrl = await getCanonicalOriginURL(originUrl);

        // read the extra inputs for the 'archives' type
        const extraData = {};
        if (originType === 'archives') {
          extraData['archives_data'] = [];
          for (let i = 0; i < $('.swh-save-origin-artifact-form').length; ++i) {
            extraData['archives_data'].push({
              'artifact_url': $(`#swh-input-artifact-url-${i}`).val(),
              'artifact_version': $(`#swh-input-artifact-version-${i}`).val()
            });
          }
        }

        originSaveRequest(originType, originUrl, extraData,
                          () => $('#swh-origin-save-request-status').html(saveRequestAcceptedAlert),
                          () => $('#swh-origin-save-request-status').html(saveRequestPendingAlert),
                          (statusCode, errorData) => {
                            $('#swh-origin-save-request-status').css('color', 'red');
                            if (statusCode === 403) {
                              const errorAlert = htmlAlert('danger', `Error: ${errorData['reason']}`);
                              $('#swh-origin-save-request-status').html(errorAlert);
                            } else if (statusCode === 429) {
                              $('#swh-origin-save-request-status').html(saveRequestRateLimitedAlert);
                            } else if (statusCode === 400) {
                              const errorAlert = htmlAlert('danger', errorData['reason']);
                              $('#swh-origin-save-request-status').html(errorAlert);
                            } else {
                              $('#swh-origin-save-request-status').html(saveRequestUnknownErrorAlert);
                            }
                          });
      } else {
        // let the browser display the validation feedback of the form
        $(event.target).addClass('was-validated');
      }
    });

    $('#swh-show-origin-save-requests-list').on('click', (event) => {
      event.preventDefault();
      $('.nav-tabs a[href="#swh-origin-save-requests-list"]').tab('show');
    });

    // trim the typed URL and preselect any visit type whose name it contains
    $('#swh-input-origin-url').on('input', function(event) {
      const originUrl = $(this).val().trim();
      $(this).val(originUrl);
      $('#swh-input-visit-type option').each(function() {
        const val = $(this).val();
        if (val && originUrl.includes(val)) {
          $(this).prop('selected', true);
        }
      });
    });

    if (window.location.hash === '#requests') {
      $('.nav-tabs a[href="#swh-origin-save-requests-list"]').tab('show');
    }

  });

}
// Validate the origin URL held by a form input and report the result through
// the input custom validity: an empty message marks the input as valid, the
// error message below marks it as invalid.
// Only input.value and input.setCustomValidity() are used from the input.
export function validateSaveOriginUrl(input) {
const originType = $('#swh-input-visit-type').val();
let originUrl = null;
let validUrl = true;
// a value that cannot be parsed as an URL is invalid
try {
originUrl = new URL(input.value.trim());
// the catch binding is merely named TypeError, any exception thrown by the
// URL constructor ends up here
} catch (TypeError) {
validUrl = false;
}
// the URL protocol must be one of the allowed ones
if (validUrl) {
- const allowedProtocols = ['http:', 'https:', 'svn:', 'git:'];
+ const allowedProtocols = ['http:', 'https:', 'svn:', 'git:', 'rsync:', 'pserver:', 'ssh:'];
validUrl = (
allowedProtocols.find(protocol => protocol === originUrl.protocol) !== undefined
);
}
if (validUrl && originType === 'git') {
// additional checks for well known code hosting providers
switch (originUrl.hostname) {
case 'github.com':
validUrl = isGitRepoUrl(originUrl);
break;
case 'git.code.sf.net':
validUrl = isGitRepoUrl(originUrl, '/p/');
break;
case 'bitbucket.org':
validUrl = isGitRepoUrl(originUrl);
break;
default:
// hosts whose name starts with 'gitlab.' get the same check,
// any other host is accepted as is
if (originUrl.hostname.startsWith('gitlab.')) {
validUrl = isGitRepoUrl(originUrl);
}
break;
}
}
if (validUrl) {
input.setCustomValidity('');
} else {
input.setCustomValidity('The origin url is not valid or does not reference a code repository');
}
}
/**
 * Install the submit handler of the "take new snapshot" form: the request is
 * posted through originSaveRequest() and an alert matching its outcome is
 * displayed in the request status area.
 */
export function initTakeNewSnapshot() {
  const statusAreaSelector = '#swh-take-new-snapshot-request-status';

  const acceptedAlert = htmlAlert(
    'success',
    'The "take new snapshot" request has been accepted and will be processed as soon as possible.',
    true
  );
  const pendingAlert = htmlAlert(
    'warning',
    'The "take new snapshot" request has been put in pending state and may be accepted for processing after manual review.',
    true
  );
  const rateLimitAlert = htmlAlert(
    'danger',
    'The rate limit for "take new snapshot" requests has been reached. Please try again later.',
    true
  );
  const unknownErrorAlert = htmlAlert(
    'danger',
    'An unexpected error happened when submitting the "save code now request".',
    true
  );

  // pick the alert matching the HTTP status code of a failed request
  const reportError = (statusCode, errorData) => {
    $(statusAreaSelector).css('color', 'red');
    switch (statusCode) {
      case 403:
        $(statusAreaSelector).html(
          htmlAlert('danger', `Error: ${errorData['detail']}`, true));
        break;
      case 429:
        $(statusAreaSelector).html(rateLimitAlert);
        break;
      default:
        $(statusAreaSelector).html(unknownErrorAlert);
    }
  };

  $(document).ready(() => {
    $('#swh-take-new-snapshot-form').submit((event) => {
      event.preventDefault();
      event.stopPropagation();

      const visitType = $('#swh-input-visit-type').val();
      const url = $('#swh-input-origin-url').val();

      // no extra data is needed for that kind of request
      originSaveRequest(
        visitType, url, {},
        () => $(statusAreaSelector).html(acceptedAlert),
        () => $(statusAreaSelector).html(pendingAlert),
        reportError
      );
    });
  });
}
/**
 * Format a value according to its declared type.
 *
 * @param {string} type - one of 'json', 'date', 'raw' or 'duration'
 * @param {*} value - value to format
 * @returns {?string|*} null when value is null, otherwise the value formatted
 *     by the formatter registered for the type
 */
export function formatValuePerType(type, value) {
  if (value === null) {
    // null values stay null whatever their type
    return null;
  }
  const formatters = {
    'json': (jsonValue) => JSON.stringify(jsonValue, null, 2),
    'date': (dateValue) => new Date(dateValue).toLocaleString(),
    'raw': (rawValue) => rawValue,
    'duration': (seconds) => seconds + ' seconds'
  };
  return formatters[type](value);
}
// Toggle the popover attached to the info icon of a save request: clicking an
// icon whose popover is already open closes it, otherwise any other open
// popover is disposed and a new one is created on the clicked icon.
// NOTE(review): the section between the popover options and the loop over
// saveRequestInfo looks truncated in this view (unbalanced quotes, missing
// fetch of saveRequestTaskInfoUrl) - check it against the original file
export async function displaySaveRequestInfo(event, saveRequestId) {
// prevent the click from reaching the handlers of the enclosing elements
event.stopPropagation();
const saveRequestTaskInfoUrl = Urls.origin_save_task_info(saveRequestId);
// close popover when clicking again on the info icon
if ($(event.target).data('bs.popover')) {
$(event.target).popover('dispose');
return;
}
// only one popover is displayed at a time
$('.swh-save-request-info').popover('dispose');
$(event.target).popover({
animation: false,
boundary: 'viewport',
container: 'body',
title: 'Save request task information ' +
'`,
content: `
';
for (const info of saveRequestInfo) {
content +=
`
${info.key}
${info.value}
`;
}
content += '
';
}
$('.swh-popover').html(content);
$(event.target).popover('update');
}
/**
 * Fill the save request form with an origin URL, select a visit type for it
 * and scroll back to the top of the page.
 *
 * Visit types whose name is contained in the URL are selected first; the
 * provided visit type is only used when none of them matched.
 *
 * @param {string} visitType - fallback visit type
 * @param {string} originUrl - origin URL to put in the form
 */
export function fillSaveRequestFormAndScroll(visitType, originUrl) {
  $('#swh-input-origin-url').val(originUrl);

  // mark every option accepted by the predicate as selected and
  // tell whether at least one was
  const selectOptionsMatching = (accepts) => {
    let anySelected = false;
    $('#swh-input-visit-type option').each(function() {
      const optionValue = $(this).val();
      if (accepts(optionValue)) {
        $(this).prop('selected', true);
        anySelected = true;
      }
    });
    return anySelected;
  };

  const guessedFromUrl = selectOptionsMatching(
    (optionValue) => optionValue && originUrl.includes(optionValue));
  if (!guessedFromUrl) {
    selectOptionsMatching((optionValue) => optionValue === visitType);
  }

  window.scrollTo(0, 0);
}
diff --git a/cypress/integration/origin-save.spec.js b/cypress/integration/origin-save.spec.js
index 0c245eb0..47dc1c84 100644
--- a/cypress/integration/origin-save.spec.js
+++ b/cypress/integration/origin-save.spec.js
@@ -1,711 +1,711 @@
/**
* Copyright (C) 2019-2021 The Software Heritage developers
* See the AUTHORS file at the top-level directory of this distribution
* License: GNU Affero General Public License version 3, or any later version
* See top-level LICENSE file for more information
*/
// URL of the page under test and origin used by the tests,
// both are assigned in the before() hook of the test suite
let url;
let origin;
// jQuery function exposed by Cypress, used to inspect DOM elements in assertions
const $ = Cypress.$;
// messages expected in the alerts displayed by the save form, keyed by scenario
const saveCodeMsg = {
'success': 'The "save code now" request has been accepted and will be processed as soon as possible.',
'warning': 'The "save code now" request has been put in pending state and may be accepted for processing after manual review.',
'rejected': 'The "save code now" request has been rejected because the provided origin url is blacklisted.',
'rateLimit': 'The rate limit for "save code now" requests has been reached. Please try again later.',
'not-found': 'The provided url does not exist',
'unknownError': 'An unexpected error happened when submitting the "save code now request',
'csrfError': 'CSRF Failed: Referrer checking failed - no Referrer.'
};
// visit types expected in the visit type select: for anonymous or unprivileged
// users, and for ambassadors
-const anonymousVisitTypes = ['git', 'hg', 'svn'];
-const allVisitTypes = ['archives', 'git', 'hg', 'svn'];
+const anonymousVisitTypes = ['cvs', 'git', 'hg', 'svn'];
+const allVisitTypes = ['archives', 'cvs', 'git', 'hg', 'svn'];
// Fill the save form with an origin URL and a visit type, then submit it
function makeOriginSaveRequest(originType, originUrl) {
  cy.get('#swh-input-origin-url').type(originUrl);
  cy.get('#swh-input-visit-type').select(originType);
  cy.get('#swh-save-origin-form').submit();
}
// Check that the request status area shows a visible alert of the given
// type containing the given message
function checkAlertVisible(alertType, msg) {
  const statusArea = cy.get('#swh-origin-save-request-status')
    .should('be.visible');
  const alertBox = statusArea.find(`.alert-${alertType}`);
  alertBox.should('be.visible').and('contain', msg);
}
// Stub requests to save an origin
// Intercept the POST request creating a save request and reply with a stubbed
// response, registered under the 'saveRequest' alias
function stubSaveRequest({
  requestUrl,
  visitType = 'git',
  saveRequestStatus,
  originUrl,
  saveTaskStatus,
  responseStatus = 200,
  // For error code with the error message in the 'reason' key response
  errorMessage = '',
  saveRequestDate = new Date(),
  visitDate = new Date(),
  visitStatus = null
} = {}) {
  const isErrorReply = responseStatus !== 200 && Boolean(errorMessage);
  // error replies only carry the reason, any other reply is a save request
  const body = isErrorReply
    ? {'reason': errorMessage}
    : genOriginSaveResponse({
      visitType,
      saveRequestStatus,
      originUrl,
      saveRequestDate,
      saveTaskStatus,
      visitDate,
      visitStatus
    });
  const stub = cy.intercept('POST', requestUrl, {body, statusCode: responseStatus});
  stub.as('saveRequest');
}
// Mocks API response : /save/(:visit_type)/(:origin_url)
// visit_type : {'git', 'hg', 'svn', ...}
// Build the payload describing a save request; dates are serialized to ISO
// strings and falsy dates are turned into null
function genOriginSaveResponse({
  visitType = 'git',
  saveRequestStatus,
  originUrl,
  saveRequestDate = new Date(),
  saveTaskStatus,
  visitDate = new Date(),
  visitStatus
} = {}) {
  const toISOStringOrNull = (date) => (date ? date.toISOString() : null);
  const response = {};
  response['visit_type'] = visitType;
  response['save_request_status'] = saveRequestStatus;
  response['origin_url'] = originUrl;
  response['id'] = 1;
  response['save_request_date'] = toISOStringOrNull(saveRequestDate);
  response['save_task_status'] = saveTaskStatus;
  response['visit_date'] = toISOStringOrNull(visitDate);
  response['visit_status'] = visitStatus;
  return response;
}
describe('Origin Save Tests', function() {
before(function() {
url = this.Urls.origin_save();
origin = this.origin[0];
this.originSaveUrl = this.Urls.api_1_save_origin(origin.type, origin.url);
});
beforeEach(function() {
cy.fixture('origin-save').as('originSaveJSON');
cy.fixture('save-task-info').as('saveTaskInfoJSON');
cy.visit(url);
});
it('should format appropriately values depending on their type', function() {
const inputValues = [ // null values stay null
{type: 'json', value: null, expectedValue: null},
{type: 'date', value: null, expectedValue: null},
{type: 'raw', value: null, expectedValue: null},
{type: 'duration', value: null, expectedValue: null},
// non null values formatted depending on their type
{type: 'json', value: '{}', expectedValue: '"{}"'},
{type: 'date', value: '04/04/2021 01:00:00', expectedValue: '4/4/2021, 1:00:00 AM'},
{type: 'raw', value: 'value-for-identity', expectedValue: 'value-for-identity'},
{type: 'duration', value: '10', expectedValue: '10 seconds'},
{type: 'duration', value: 100, expectedValue: '100 seconds'}
];
cy.window().then(win => {
inputValues.forEach(function(input, index, array) {
const actualValue = win.swh.save.formatValuePerType(input.type, input.value);
assert.equal(actualValue, input.expectedValue);
});
});
});
it('should display accepted message when accepted', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
saveRequestStatus: 'accepted',
originUrl: origin.url,
saveTaskStatus: 'not yet scheduled'});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('success', saveCodeMsg['success']);
});
});
it('should validate gitlab subproject url', function() {
const gitlabSubProjectUrl = 'https://gitlab.com/user/project/sub/';
const originSaveUrl = this.Urls.api_1_save_origin('git', gitlabSubProjectUrl);
stubSaveRequest({requestUrl: originSaveUrl,
saveRequestStatus: 'accepted',
originurl: gitlabSubProjectUrl,
saveTaskStatus: 'not yet scheduled'});
makeOriginSaveRequest('git', gitlabSubProjectUrl);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('success', saveCodeMsg['success']);
});
});
it('should validate project url with _ in username', function() {
const gitlabSubProjectUrl = 'https://gitlab.com/user_name/project.git';
const originSaveUrl = this.Urls.api_1_save_origin('git', gitlabSubProjectUrl);
stubSaveRequest({requestUrl: originSaveUrl,
saveRequestStatus: 'accepted',
originurl: gitlabSubProjectUrl,
saveTaskStatus: 'not yet scheduled'});
makeOriginSaveRequest('git', gitlabSubProjectUrl);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('success', saveCodeMsg['success']);
});
});
it('should display warning message when pending', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
saveRequestStatus: 'pending',
originUrl: origin.url,
saveTaskStatus: 'not created'});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('warning', saveCodeMsg['warning']);
});
});
it('should show error when the origin does not exist (status: 400)', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
originUrl: origin.url,
responseStatus: 400,
errorMessage: saveCodeMsg['not-found']});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('danger', saveCodeMsg['not-found']);
});
});
it('should show error when csrf validation failed (status: 403)', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
saveRequestStatus: 'rejected',
originUrl: origin.url,
saveTaskStatus: 'not created',
responseStatus: 403,
errorMessage: saveCodeMsg['csrfError']});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('danger', saveCodeMsg['csrfError']);
});
});
it('should show error when origin is rejected (status: 403)', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
saveRequestStatus: 'rejected',
originUrl: origin.url,
saveTaskStatus: 'not created',
responseStatus: 403,
errorMessage: saveCodeMsg['rejected']});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('danger', saveCodeMsg['rejected']);
});
});
it('should show error when rate limited (status: 429)', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
saveRequestStatus: 'Request was throttled. Expected available in 60 seconds.',
originUrl: origin.url,
saveTaskStatus: 'not created',
responseStatus: 429});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('danger', saveCodeMsg['rateLimit']);
});
});
it('should show error when unknown error occurs (status other than 200, 403, 429)', function() {
stubSaveRequest({requestUrl: this.originSaveUrl,
saveRequestStatus: 'Error',
originUrl: origin.url,
saveTaskStatus: 'not created',
responseStatus: 406});
makeOriginSaveRequest(origin.type, origin.url);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('danger', saveCodeMsg['unknownError']);
});
});
it('should display origin save info in the requests table', function() {
cy.intercept('/save/requests/list/**', {fixture: 'origin-save'});
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('tbody tr').then(rows => {
let i = 0;
for (const row of rows) {
const cells = row.cells;
const requestDateStr = new Date(this.originSaveJSON.data[i].save_request_date).toLocaleString();
const saveStatus = this.originSaveJSON.data[i].save_task_status;
assert.equal($(cells[0]).text(), requestDateStr);
assert.equal($(cells[1]).text(), this.originSaveJSON.data[i].visit_type);
let html = '';
if (saveStatus === 'succeeded') {
let browseOriginUrl = `${this.Urls.browse_origin()}?origin_url=${encodeURIComponent(this.originSaveJSON.data[i].origin_url)}`;
browseOriginUrl += `×tamp=${encodeURIComponent(this.originSaveJSON.data[i].visit_date)}`;
html += `${this.originSaveJSON.data[i].origin_url}`;
} else {
html += this.originSaveJSON.data[i].origin_url;
}
html += ` `;
html += '';
assert.equal($(cells[2]).html(), html);
assert.equal($(cells[3]).text(), this.originSaveJSON.data[i].save_request_status);
assert.equal($(cells[4]).text(), saveStatus);
++i;
}
});
});
it('should not add timestamp to the browse origin URL is no visit date has been found', function() {
const originUrl = 'https://git.example.org/example.git';
const saveRequestData = genOriginSaveResponse({
saveRequestStatus: 'accepted',
originUrl: originUrl,
saveTaskStatus: 'succeeded',
visitDate: null,
visitStatus: 'full'
});
const saveRequestsListData = {
'recordsTotal': 1,
'draw': 2,
'recordsFiltered': 1,
'data': [saveRequestData]
};
cy.intercept('/save/requests/list/**', {body: saveRequestsListData})
.as('saveRequestsList');
cy.get('#swh-origin-save-requests-list-tab').click();
cy.wait('@saveRequestsList');
cy.get('tbody tr').then(rows => {
const firstRowCells = rows[0].cells;
const browseOriginUrl = `${this.Urls.browse_origin()}?origin_url=${encodeURIComponent(originUrl)}`;
const browseOriginLink = `${originUrl}`;
expect($(firstRowCells[2]).html()).to.have.string(browseOriginLink);
});
});
it('should display/close task info popover when clicking on the info button', function() {
cy.intercept('/save/requests/list/**', {fixture: 'origin-save'});
cy.intercept('/save/task/info/**', {fixture: 'save-task-info'});
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('.swh-save-request-info')
.eq(0)
.click();
cy.get('.swh-save-request-info-popover')
.should('be.visible');
cy.get('.swh-save-request-info')
.eq(0)
.click();
cy.get('.swh-save-request-info-popover')
.should('not.exist');
});
it('should hide task info popover when clicking on the close button', function() {
cy.intercept('/save/requests/list/**', {fixture: 'origin-save'});
cy.intercept('/save/task/info/**', {fixture: 'save-task-info'});
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('.swh-save-request-info')
.eq(0)
.click();
cy.get('.swh-save-request-info-popover')
.should('be.visible');
cy.get('.swh-save-request-info-close')
.click();
cy.get('.swh-save-request-info-popover')
.should('not.exist');
});
it('should fill save request form when clicking on "Save again" button', function() {
cy.intercept('/save/requests/list/**', {fixture: 'origin-save'});
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('.swh-save-origin-again')
.eq(0)
.click();
cy.get('tbody tr').eq(0).then(row => {
const cells = row[0].cells;
cy.get('#swh-input-visit-type')
.should('have.value', $(cells[1]).text());
cy.get('#swh-input-origin-url')
.should('have.value', $(cells[2]).text().slice(0, -1));
});
});
it('should select correct visit type if possible when clicking on "Save again" button', function() {
const originUrl = 'https://gitlab.inria.fr/solverstack/maphys/maphys/';
const badVisitType = 'hg';
const goodVisitType = 'git';
cy.intercept('/save/requests/list/**', {fixture: 'origin-save'});
stubSaveRequest({requestUrl: this.Urls.api_1_save_origin(badVisitType, originUrl),
visitType: badVisitType,
saveRequestStatus: 'accepted',
originUrl: originUrl,
saveTaskStatus: 'failed',
visitStatus: 'failed',
responseStatus: 200,
errorMessage: saveCodeMsg['accepted']});
makeOriginSaveRequest(badVisitType, originUrl);
cy.get('#swh-origin-save-requests-list-tab').click();
cy.wait('@saveRequest').then(() => {
cy.get('.swh-save-origin-again')
.eq(0)
.click();
cy.get('tbody tr').eq(0).then(row => {
const cells = row[0].cells;
cy.get('#swh-input-visit-type')
.should('have.value', goodVisitType);
cy.get('#swh-input-origin-url')
.should('have.value', $(cells[2]).text().slice(0, -1));
});
});
});
it('should create save request for authenticated user', function() {
cy.userLogin();
cy.visit(url);
const originUrl = 'https://git.example.org/account/repo';
stubSaveRequest({requestUrl: this.Urls.api_1_save_origin('git', originUrl),
saveRequestStatus: 'accepted',
originUrl: origin.url,
saveTaskStatus: 'not yet scheduled'});
makeOriginSaveRequest('git', originUrl);
cy.wait('@saveRequest').then(() => {
checkAlertVisible('success', saveCodeMsg['success']);
});
});
it('should not show user requests filter checkbox for anonymous users', function() {
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('#swh-save-requests-user-filter').should('not.exist');
});
it('should show user requests filter checkbox for authenticated users', function() {
cy.userLogin();
cy.visit(url);
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('#swh-save-requests-user-filter').should('exist');
});
it('should show only user requests when filter is activated', function() {
cy.intercept('POST', '/api/1/origin/save/**')
.as('saveRequest');
const originAnonymousUser = 'https://some.git.server/project/';
const originAuthUser = 'https://other.git.server/project/';
// anonymous user creates a save request
makeOriginSaveRequest('git', originAnonymousUser);
cy.wait('@saveRequest');
// authenticated user creates another save request
cy.userLogin();
cy.visit(url);
makeOriginSaveRequest('git', originAuthUser);
cy.wait('@saveRequest');
// user requests filter checkbox should be in the DOM
cy.get('#swh-origin-save-requests-list-tab').click();
cy.get('#swh-save-requests-user-filter').should('exist');
// check unfiltered user requests
cy.get('tbody tr').then(rows => {
expect(rows.length).to.eq(2);
expect($(rows[0].cells[2]).text()).to.contain(originAuthUser);
expect($(rows[1].cells[2]).text()).to.contain(originAnonymousUser);
});
// activate filter and check filtered user requests
cy.get('#swh-save-requests-user-filter')
.click({force: true});
cy.get('tbody tr').then(rows => {
expect(rows.length).to.eq(1);
expect($(rows[0].cells[2]).text()).to.contain(originAuthUser);
});
// deactivate filter and check unfiltered user requests
cy.get('#swh-save-requests-user-filter')
.click({force: true});
cy.get('tbody tr').then(rows => {
expect(rows.length).to.eq(2);
});
});
it('should list unprivileged visit types when not connected', function() {
cy.visit(url);
cy.get('#swh-input-visit-type').children('option').then(options => {
const actual = [...options].map(o => o.value);
expect(actual).to.deep.eq(anonymousVisitTypes);
});
});
it('should list unprivileged visit types when connected as unprivileged user', function() {
cy.userLogin();
cy.visit(url);
cy.get('#swh-input-visit-type').children('option').then(options => {
const actual = [...options].map(o => o.value);
expect(actual).to.deep.eq(anonymousVisitTypes);
});
});
it('should list privileged visit types when connected as ambassador', function() {
cy.ambassadorLogin();
cy.visit(url);
cy.get('#swh-input-visit-type').children('option').then(options => {
const actual = [...options].map(o => o.value);
expect(actual).to.deep.eq(allVisitTypes);
});
});
it('should display extra inputs when dealing with \'archives\' visit type', function() {
cy.ambassadorLogin();
cy.visit(url);
for (const visitType of anonymousVisitTypes) {
cy.get('#swh-input-visit-type').select(visitType);
cy.get('.swh-save-origin-archives-form').should('not.be.visible');
}
// this should display more inputs with the 'archives' type
cy.get('#swh-input-visit-type').select('archives');
cy.get('.swh-save-origin-archives-form').should('be.visible');
});
it('should be allowed to submit \'archives\' save request when connected as ambassador', function() {
const originUrl = 'https://ftp.gnu.org/pub/pub/gnu/3dldf';
const artifactUrl = 'https://ftp.gnu.org/pub/pub/gnu/3dldf/3DLDF-1.1.4.tar.gz';
const artifactVersion = '1.1.4';
stubSaveRequest({
requestUrl: this.Urls.api_1_save_origin('archives', originUrl),
saveRequestStatus: 'accepted',
originUrl: originUrl,
saveTaskStatus: 'not yet scheduled'
});
cy.ambassadorLogin();
cy.visit(url);
// input new 'archives' information and submit
cy.get('#swh-input-origin-url')
.type(originUrl)
.get('#swh-input-visit-type')
.select('archives')
.get('#swh-input-artifact-url-0')
.type(artifactUrl)
.get('#swh-input-artifact-version-0')
.clear()
.type(artifactVersion)
.get('#swh-save-origin-form')
.submit();
cy.wait('@saveRequest').then(() => {
checkAlertVisible('success', saveCodeMsg['success']);
});
});
it('should submit multiple artifacts for the archives visit type', function() {
  // Scenario: fill a first artifact row, add then remove a second row, add it
  // again and fill it, then check that both artifacts are sent in the POST body.
  const origin = 'https://ftp.gnu.org/pub/pub/gnu/3dldf';
  const firstArtifact = {
    artifact_url: 'https://ftp.gnu.org/pub/pub/gnu/3dldf/3DLDF-1.1.4.tar.gz',
    artifact_version: '1.1.4'
  };
  const secondArtifact = {
    artifact_url: 'https://ftp.gnu.org/pub/pub/gnu/3dldf/3DLDF-1.1.5.tar.gz',
    artifact_version: '1.1.5'
  };

  cy.ambassadorLogin();
  cy.visit(url);

  cy.get('#swh-input-origin-url').type(origin);
  cy.get('#swh-input-visit-type').select('archives');

  // fill first artifact info
  cy.get('#swh-input-artifact-url-0').type(firstArtifact.artifact_url);
  cy.get('#swh-input-artifact-version-0').clear().type(firstArtifact.artifact_version);

  // add new artifact form row
  cy.get('#swh-add-archive-artifact').click();

  // check new row is displayed
  cy.get('#swh-input-artifact-url-1').should('exist');

  // request removal of newly added row
  cy.get('#swh-remove-archive-artifact-1').click();

  // check row has been removed
  cy.get('#swh-input-artifact-url-1').should('not.exist');

  // add new artifact form row
  cy.get('#swh-add-archive-artifact').click();

  // fill second artifact info
  cy.get('#swh-input-artifact-url-1').type(secondArtifact.artifact_url);
  cy.get('#swh-input-artifact-version-1').clear().type(secondArtifact.artifact_version);

  // setup request interceptor to check POST data and stub response
  const saveUrl = this.Urls.api_1_save_origin('archives', origin);
  cy.intercept('POST', saveUrl, (req) => {
    // both artifacts must be posted, in the order they were entered
    expect(req.body).to.deep.equal({
      archives_data: [firstArtifact, secondArtifact]
    });
    req.reply(genOriginSaveResponse({
      visitType: 'archives',
      saveRequestStatus: 'accepted',
      originUrl: origin,
      saveRequestDate: new Date(),
      saveTaskStatus: 'not yet scheduled',
      visitDate: null,
      visitStatus: null
    }));
  }).as('saveRequest');

  // submit form
  cy.get('#swh-save-origin-form').submit();

  // submission should be successful
  cy.wait('@saveRequest').then(() => {
    checkAlertVisible('success', saveCodeMsg['success']);
  });
});
it('should autofill artifact version when pasting artifact url', function() {
  // Scenario: typing an artifact url must fill the matching version input
  // with the file name stripped of its extension(s).
  const origin = 'https://ftp.gnu.org/pub/pub/gnu/3dldf';
  // each entry: form row index, typed artifact url, expected autofilled version
  const firstRow = {
    index: 0,
    artifactUrl: 'https://ftp.gnu.org/pub/pub/gnu/3dldf/3DLDF-1.1.4.tar.gz',
    expectedVersion: '3DLDF-1.1.4'
  };
  const secondRow = {
    index: 1,
    artifactUrl: 'https://example.org/artifact/test/1.3.0.zip',
    expectedVersion: '1.3.0'
  };

  cy.ambassadorLogin();
  cy.visit(url);

  cy.get('#swh-input-origin-url').type(origin);
  cy.get('#swh-input-visit-type').select('archives');

  // fill first artifact info
  cy.get(`#swh-input-artifact-url-${firstRow.index}`).type(firstRow.artifactUrl);

  // check autofilled version
  cy.get(`#swh-input-artifact-version-${firstRow.index}`)
    .should('have.value', firstRow.expectedVersion);

  // add new artifact form row
  cy.get('#swh-add-archive-artifact').click();

  // fill second artifact info
  cy.get(`#swh-input-artifact-url-${secondRow.index}`).type(secondRow.artifactUrl);

  // check autofilled version
  cy.get(`#swh-input-artifact-version-${secondRow.index}`)
    .should('have.value', secondRow.expectedVersion);
});
it('should use canonical URL for github repository to save', function() {
  // Scenario: whatever the case or suffix of the submitted github URL, the
  // save request must be created with the canonical URL returned by the
  // (stubbed) github Web API.
  const ownerRepo = 'BIC-MNI/mni_autoreg';
  const canonicalOriginUrl = 'https://github.com/BIC-MNI/mni_autoreg';
  const nonCanonicalUrls = [
    'https://github.com/BiC-MnI/MnI_AuToReG',
    'https://github.com/BiC-MnI/MnI_AuToReG.git',
    'https://github.com/BiC-MnI/MnI_AuToReG/'
  ];

  // stub call to github Web API fetching canonical repo URL
  cy.intercept(`https://api.github.com/repos/${ownerRepo.toLowerCase()}`, (req) => {
    req.reply({html_url: canonicalOriginUrl});
  }).as('ghWebApiRequest');

  // stub save request creation with canonical URL of github repo
  const saveUrl = this.Urls.api_1_save_origin('git', canonicalOriginUrl);
  cy.intercept('POST', saveUrl, (req) => {
    req.reply(genOriginSaveResponse({
      visitType: 'git',
      saveRequestStatus: 'accepted',
      originUrl: canonicalOriginUrl,
      saveRequestDate: new Date(),
      saveTaskStatus: 'not yet scheduled',
      visitDate: null,
      visitStatus: null
    }));
  }).as('saveRequest');

  nonCanonicalUrls.forEach((submittedUrl) => {
    // enter non canonical URL of github repo
    cy.get('#swh-input-origin-url').clear().type(submittedUrl);

    // submit form
    cy.get('#swh-save-origin-form').submit();

    // submission should be successful
    cy.wait('@ghWebApiRequest')
      .wait('@saveRequest').then(() => {
        checkAlertVisible('success', saveCodeMsg['success']);
      });
  });
});
});
diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py
index d088cb26..62d74e81 100644
--- a/swh/web/common/origin_save.py
+++ b/swh/web/common/origin_save.py
@@ -1,911 +1,918 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from bisect import bisect_right
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from itertools import product
import json
import logging
from typing import Any, Dict, List, Optional, Tuple
from prometheus_client import Gauge
import requests
import sentry_sdk
from django.core.exceptions import ObjectDoesNotExist, ValidationError
from django.core.validators import URLValidator
from django.db.models import Q, QuerySet
from django.utils.html import escape
from swh.scheduler.utils import create_oneshot_task_dict
from swh.web.common import archive
from swh.web.common.exc import BadInputExc, ForbiddenExc, NotFoundExc
from swh.web.common.models import (
SAVE_REQUEST_ACCEPTED,
SAVE_REQUEST_PENDING,
SAVE_REQUEST_REJECTED,
SAVE_TASK_FAILED,
SAVE_TASK_NOT_CREATED,
SAVE_TASK_NOT_YET_SCHEDULED,
SAVE_TASK_RUNNING,
SAVE_TASK_SCHEDULED,
SAVE_TASK_SUCCEEDED,
VISIT_STATUS_CREATED,
VISIT_STATUS_ONGOING,
SaveAuthorizedOrigin,
SaveOriginRequest,
SaveUnauthorizedOrigin,
)
from swh.web.common.origin_visits import get_origin_visits
from swh.web.common.typing import (
OriginExistenceCheckInfo,
OriginInfo,
SaveOriginRequestInfo,
)
from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc
from swh.web.config import get_config, scheduler
# Module-level logger named after this module; used further down to report
# failed requests to Elasticsearch
logger = logging.getLogger(__name__)
# Number of days in the past to lookup for information: visits are no longer
# searched for save requests older than that, and a request whose loading task
# status still cannot be determined after that delay is considered failed
MAX_THRESHOLD_DAYS = 30
# Non terminal visit statuses which needs updates: a save request whose visit
# is in one of those statuses has its information refreshed again
NON_TERMINAL_STATUSES = [
VISIT_STATUS_CREATED,
VISIT_STATUS_ONGOING,
]
def get_origin_save_authorized_urls() -> List[str]:
    """
    List the origin url prefixes that can be loaded into the archive
    right away, without manual review (whitelist).

    Returns:
        list: the ``url`` of every stored :class:`SaveAuthorizedOrigin`
    """
    authorized_urls = []
    for authorized_origin in SaveAuthorizedOrigin.objects.all():
        authorized_urls.append(authorized_origin.url)
    return authorized_urls
def get_origin_save_unauthorized_urls() -> List[str]:
    """
    List the origin url prefixes that must never be loaded into
    the archive (blacklist).

    Returns:
        list: the ``url`` of every stored :class:`SaveUnauthorizedOrigin`
    """
    unauthorized_urls = []
    for unauthorized_origin in SaveUnauthorizedOrigin.objects.all():
        unauthorized_urls.append(unauthorized_origin.url)
    return unauthorized_urls
def can_save_origin(origin_url: str, bypass_pending_review: bool = False) -> str:
    """
    Decide what to do with a save request for a software origin.

    The decision only depends on the origin url prefix:

        * **rejected** if the url starts with a blacklisted prefix
          (checked first, so the blacklist wins over the whitelist)
        * **accepted** if the url starts with a whitelisted prefix
        * otherwise **pending** manual review, unless the review is bypassed

    Args:
        origin_url (str): the software origin url to check
        bypass_pending_review (bool): when True, an url matching neither
            list is recorded as an authorized origin and accepted instead
            of being put in pending state

    Returns:
        str: the origin save request status, either **accepted**,
        **rejected** or **pending**
    """
    # origin url may be blacklisted
    if any(
        origin_url.startswith(forbidden_prefix)
        for forbidden_prefix in get_origin_save_unauthorized_urls()
    ):
        return SAVE_REQUEST_REJECTED

    # if the origin url is in the white list, it can be immediately saved
    if any(
        origin_url.startswith(trusted_prefix)
        for trusted_prefix in get_origin_save_authorized_urls()
    ):
        return SAVE_REQUEST_ACCEPTED

    # otherwise, the origin url needs to be manually verified if the user
    # that submitted it does not have special permission
    if not bypass_pending_review:
        return SAVE_REQUEST_PENDING

    # mark the origin URL as trusted in that case
    SaveAuthorizedOrigin.objects.get_or_create(url=origin_url)
    return SAVE_REQUEST_ACCEPTED
# map visit type to scheduler task
# TODO: do not hardcode the task name here (T1157)
-_visit_type_task = {"git": "load-git", "hg": "load-hg", "svn": "load-svn"}
+_visit_type_task = {
+ "git": "load-git",
+ "hg": "load-hg",
+ "svn": "load-svn",
+ "cvs": "load-cvs",
+}
# visit types (and their scheduler task) reserved to privileged users: they
# are only merged in by get_savable_visit_types_dict when privileged_user
# is True
_visit_type_task_privileged = {
"archives": "load-archive-files",
}
# map scheduler task status to origin save status
# (a disabled scheduler task is reported as a failed save task)
_save_task_status = {
"next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED,
"next_run_scheduled": SAVE_TASK_SCHEDULED,
"completed": SAVE_TASK_SUCCEEDED,
"disabled": SAVE_TASK_FAILED,
}
# map scheduler task_run status to origin save status
# (eventful and uneventful runs both count as succeeded; failed, permfailed
# and lost runs all count as failed)
_save_task_run_status = {
"scheduled": SAVE_TASK_SCHEDULED,
"started": SAVE_TASK_RUNNING,
"eventful": SAVE_TASK_SUCCEEDED,
"uneventful": SAVE_TASK_SUCCEEDED,
"failed": SAVE_TASK_FAILED,
"permfailed": SAVE_TASK_FAILED,
"lost": SAVE_TASK_FAILED,
}
@lru_cache()
def get_scheduler_load_task_types() -> List[str]:
    """Return the scheduler task types whose name starts with ``load``.

    The result is memoized by ``lru_cache``, so the scheduler is only
    queried on the first successful call.
    """
    load_task_types = []
    for task_type in scheduler().get_task_types():
        type_name = task_type["type"]
        if type_name.startswith("load"):
            load_task_types.append(type_name)
    return load_task_types
def get_savable_visit_types_dict(privileged_user: bool = False) -> Dict:
    """Return the mapping from visit type to scheduler task type the user
    has access to.

    Args:
        privileged_user: Flag to determine if all visit types should be returned or not.
            Default to False to only list unprivileged visit types.

    Returns:
        the dict of supported visit types for the user, restricted to the
        load task types known by the scheduler when those can be retrieved

    """
    task_types = (
        {**_visit_type_task, **_visit_type_task_privileged}
        if privileged_user
        else _visit_type_task
    )

    # filter visit types according to scheduler load task types if available
    try:
        known_load_tasks = get_scheduler_load_task_types()
        supported = {}
        for visit_type, task_name in task_types.items():
            if task_name in known_load_tasks:
                supported[visit_type] = task_name
        return supported
    except Exception:
        # best effort: fall back to the unfiltered mapping
        return task_types
def get_savable_visit_types(privileged_user: bool = False) -> List[str]:
    """Return the sorted names of the visit types the user can submit
    save requests for.

    Args:
        privileged_user: Flag to determine if all visit types should be returned or not.
            Default to False to only list unprivileged visit types.

    Returns:
        the list of saveable visit types

    """
    # iterating a dict yields its keys, i.e. the visit type names
    return sorted(get_savable_visit_types_dict(privileged_user))
def _check_visit_type_savable(visit_type: str, privileged_user: bool = False) -> None:
    """Raise BadInputExc when the visit type is not among the visit types the
    user (privileged or not) is allowed to save."""
    savable_visit_types = get_savable_visit_types(privileged_user)
    if visit_type in savable_visit_types:
        return
    allowed_visit_types = ", ".join(savable_visit_types)
    raise BadInputExc(
        f"Visit of type {visit_type} can not be saved! "
        f"Allowed types are the following: {allowed_visit_types}"
    )
-_validate_url = URLValidator(schemes=["http", "https", "svn", "git"])
+_validate_url = URLValidator(
+ schemes=["http", "https", "svn", "git", "rsync", "pserver", "ssh"]
+)
def _check_origin_url_valid(origin_url: str) -> None:
    """Raise BadInputExc when the origin url is rejected by the module-level
    url validator; the url is HTML-escaped in the error message."""
    try:
        _validate_url(origin_url)
    except ValidationError:
        error_message = "The provided origin url (%s) is not valid!" % escape(
            origin_url
        )
        raise BadInputExc(error_message)
def origin_exists(origin_url: str) -> OriginExistenceCheckInfo:
    """Check the origin url for existence. If it exists, extract some more useful
    information on the origin.

    The check is a HEAD request following redirects; the url is considered to
    exist when the final response status is a success.

    Args:
        origin_url: the url to check

    Returns:
        Existence information on the url: whether it exists and, when provided
        by the server in a parsable form, its last modification date (ISO 8601
        string) and its content length (otherwise None)

    """
    # always bound the wait: without a timeout, an unresponsive server would
    # block the request (and the calling web worker) indefinitely
    resp = requests.head(origin_url, allow_redirects=True, timeout=30)
    exists = resp.ok
    content_length: Optional[int] = None
    last_modified: Optional[str] = None
    if exists:
        # Also process X-Archive-Orig-* headers in case the URL targets the
        # Internet Archive.
        size_ = resp.headers.get(
            "Content-Length", resp.headers.get("X-Archive-Orig-Content-Length")
        )
        try:
            content_length = int(size_) if size_ else None
        except ValueError:
            # malformed header value: same policy as for the date below,
            # keep it None instead of failing the whole check
            content_length = None
        try:
            date_str = resp.headers.get(
                "Last-Modified", resp.headers.get("X-Archive-Orig-Last-Modified", "")
            )
            date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z")
            last_modified = date.isoformat()
        except ValueError:
            # if not provided or not parsable as per the expected format, keep it None
            pass

    return OriginExistenceCheckInfo(
        origin_url=origin_url,
        exists=exists,
        last_modified=last_modified,
        content_length=content_length,
    )
def _check_origin_exists(url: str) -> OriginExistenceCheckInfo:
    """Return the existence information of an URL, or raise BadInputExc with
    an explicit message when the URL does not exist."""
    existence_info = origin_exists(url)
    if existence_info["exists"]:
        return existence_info
    raise BadInputExc(f"The provided url ({escape(url)}) does not exist!")
def _get_visit_info_for_save_request(
    save_request: SaveOriginRequest,
) -> Tuple[Optional[datetime], Optional[str]]:
    """Look up the first visit of the origin that happened after a save
    request was submitted.

    Args:
        save_request: Input save origin request to retrieve information for.

    Returns:
        Tuple of (visit date, optional visit status) for such save request origin;
        both are None when no such visit is found, when the lookup fails (the
        error is reported to sentry) or when the request is too old

    """
    elapsed = datetime.now(tz=timezone.utc) - save_request.request_date
    # stop trying to find a visit date one month after save request submission
    # as those requests to storage are expensive and associated loading task
    # surely ended up with errors
    if elapsed.days > MAX_THRESHOLD_DAYS:
        return None, None

    visit_date = None
    visit_status = None
    try:
        origin = archive.lookup_origin(OriginInfo(url=save_request.origin_url))
        visits = get_origin_visits(origin)
        dates = [parse_iso8601_date_to_utc(visit["date"]) for visit in visits]
        # index of the first visit strictly after the request date
        position = bisect_right(dates, save_request.request_date)
        if position < len(dates):
            visit_date = dates[position]
            visit_status = visits[position]["status"]
    except Exception as exc:
        sentry_sdk.capture_exception(exc)
    return visit_date, visit_status
def _check_visit_update_status(
    save_request: SaveOriginRequest,
) -> Tuple[Optional[datetime], Optional[str], Optional[str]]:
    """Derive the loading task status of a save request from the visit that
    followed it, if any.

    Args:
        save_request: Input save origin request to retrieve information for.

    Returns:
        Tuple of (optional visit date, optional visit status, optional save task status)
        for such save request origin

    """
    visit_date, visit_status = _get_visit_info_for_save_request(save_request)

    if visit_date and visit_status in ("full", "partial"):
        # visit has been performed, mark the saving task as succeeded
        return visit_date, visit_status, SAVE_TASK_SUCCEEDED

    if visit_status in ("created", "ongoing"):
        # visit is currently running
        return visit_date, visit_status, SAVE_TASK_RUNNING

    if visit_status in ("not_found", "failed"):
        return visit_date, visit_status, SAVE_TASK_FAILED

    # consider the task as failed if it is still in scheduled state
    # 30 days after its submission
    request_age = datetime.now(tz=timezone.utc) - save_request.request_date
    if request_age.days > MAX_THRESHOLD_DAYS:
        return visit_date, visit_status, SAVE_TASK_FAILED

    # nothing conclusive yet: the caller falls back to scheduler information
    return visit_date, visit_status, None
def _compute_task_loading_status(
    task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Translate scheduler information into a save task status.

    The status of the task run, when provided, takes precedence over the
    status of the task itself. None is returned when neither is provided.
    """
    # the task status is translated first (even if a task run overrides it)
    status_from_task: Optional[str] = (
        _save_task_status[task["status"]] if task else None
    )
    if task_run:
        return _save_task_run_status[task_run["status"]]
    return status_from_task
def _update_save_request_info(
    save_request: SaveOriginRequest,
    task: Optional[Dict[str, Any]] = None,
    task_run: Optional[Dict[str, Any]] = None,
) -> SaveOriginRequestInfo:
    """Refresh a save request from the visit status, falling back to the task and
    task_run information when the visit status does not settle the loading
    task status. The request is saved to the database only if something changed.

    Args:
        save_request: Save request
        task: Associated scheduler task information about the save request
        task_run: Most recent run occurrence of the associated task

    Returns:
        Summary of the save request information updated.

    """
    # To determine the save code now request's final status, the visit date must be set
    # and the visit status must be a final one. Once they do, the save code now is
    # definitely done.
    is_done = (
        save_request.visit_date
        and save_request.visit_status
        and save_request.visit_status not in NON_TERMINAL_STATUSES
    )
    if is_done:
        return save_request.to_dict()

    visit_date, visit_status, loading_task_status = _check_visit_update_status(
        save_request
    )
    if not loading_task_status:  # fallback when not provided
        loading_task_status = _compute_task_loading_status(task, task_run)

    # candidate new values; an undetermined loading task status never
    # overwrites the stored one
    new_values = {"visit_date": visit_date, "visit_status": visit_status}
    if loading_task_status is not None:
        new_values["loading_task_status"] = loading_task_status

    must_save = False
    for field_name, new_value in new_values.items():
        if new_value != getattr(save_request, field_name):
            setattr(save_request, field_name, new_value)
            must_save = True

    if must_save:
        save_request.save()

    return save_request.to_dict()
def create_save_origin_request(
    visit_type: str,
    origin_url: str,
    privileged_user: bool = False,
    user_id: Optional[int] = None,
    **kwargs,
) -> SaveOriginRequestInfo:
    """Create a loading task to save a software origin into the archive.

    This function aims to create a software origin loading task through the use of the
    swh-scheduler component.

    First, some checks are performed to see if the visit type and origin url are valid
    but also if the save request can be accepted. For the 'archives' visit type,
    this also ensures the artifacts actually exist. If those checks passed, the loading
    task is then created. Otherwise, the save request is put in pending or rejected
    state.

    All the submitted save requests are logged into the swh-web database to keep track
    of them.

    Args:
        visit_type: the type of visit to perform (e.g. git, hg, svn, archives, ...)
        origin_url: the url of the origin to save
        privileged_user: Whether the user has some more privilege than other (bypass
          review, access to privileged other visit types)
        user_id: User identifier (provided when authenticated)
        kwargs: Optional parameters; for the 'archives' visit type, the
          ``archives_data`` list of dicts with keys ``artifact_url`` and
          ``artifact_version``

    Raises:
        BadInputExc: the visit type or origin url is invalid or inexistent
        ForbiddenExc: the provided origin url is blacklisted

    Returns:
        dict: A dict describing the save request with the following keys:

            * **visit_type**: the type of visit to perform
            * **origin_url**: the url of the origin
            * **save_request_date**: the date the request was submitted
            * **save_request_status**: the request status, either **accepted**,
              **rejected** or **pending**
            * **save_task_status**: the origin loading task status, either
              **not created**, **not yet scheduled**, **scheduled**,
              **succeed** or **failed**

    """
    visit_type_tasks = get_savable_visit_types_dict(privileged_user)
    _check_visit_type_savable(visit_type, privileged_user)
    _check_origin_url_valid(origin_url)

    # if all checks passed so far, we can try and save the origin
    save_request_status = can_save_origin(origin_url, privileged_user)
    task = None

    # if the origin save request is accepted, create a scheduler
    # task to load it into the archive
    if save_request_status == SAVE_REQUEST_ACCEPTED:
        # create a task with high priority
        task_kwargs: Dict[str, Any] = {
            "priority": "high",
            "url": origin_url,
        }
        if visit_type == "archives":
            # extra arguments for that type are required
            archives_data = kwargs.get("archives_data", [])
            if not archives_data:
                raise BadInputExc(
                    "Artifacts data are missing for the archives visit type."
                )
            artifacts = []
            for artifact in archives_data:
                artifact_url = artifact.get("artifact_url")
                artifact_version = artifact.get("artifact_version")
                if not artifact_url or not artifact_version:
                    raise BadInputExc("Missing url or version for an artifact to load.")
                # raises BadInputExc if the artifact url does not exist
                metadata = _check_origin_exists(artifact_url)
                artifacts.append(
                    {
                        "url": artifact_url,
                        "version": artifact_version,
                        "time": metadata["last_modified"],
                        "length": metadata["content_length"],
                    }
                )
            task_kwargs = dict(**task_kwargs, artifacts=artifacts, snapshot_append=True)

        sor = None
        # get list of previously submitted save requests (most recent first)
        current_sors = list(
            SaveOriginRequest.objects.filter(
                visit_type=visit_type, origin_url=origin_url
            ).order_by("-request_date")
        )

        can_create_task = False
        # if no save requests previously submitted, create the scheduler task
        if not current_sors:
            can_create_task = True
        else:
            # get the latest submitted save request
            sor = current_sors[0]
            # if it was in pending state, we need to create the scheduler task
            # and update the save request info in the database
            if sor.status == SAVE_REQUEST_PENDING:
                can_create_task = True
            # a task has already been created to load the origin
            elif sor.loading_task_id != -1:
                # get the scheduler task and its status
                tasks = scheduler().get_tasks([sor.loading_task_id])
                task = tasks[0] if tasks else None
                task_runs = scheduler().get_task_runs([sor.loading_task_id])
                task_run = task_runs[0] if task_runs else None
                save_request_info = _update_save_request_info(sor, task, task_run)
                task_status = save_request_info["save_task_status"]
                # create a new scheduler task only if the previous one has been
                # already executed
                if (
                    task_status == SAVE_TASK_FAILED
                    or task_status == SAVE_TASK_SUCCEEDED
                ):
                    can_create_task = True
                    # forget the previous request: a new one gets created below
                    sor = None
                else:
                    can_create_task = False

        if can_create_task:
            # effectively create the scheduler task
            task_dict = create_oneshot_task_dict(
                visit_type_tasks[visit_type], **task_kwargs
            )
            task = scheduler().create_tasks([task_dict])[0]

            # pending save request has been accepted
            if sor:
                sor.status = SAVE_REQUEST_ACCEPTED
                sor.loading_task_id = task["id"]
                sor.save()
            else:
                sor = SaveOriginRequest.objects.create(
                    visit_type=visit_type,
                    origin_url=origin_url,
                    status=save_request_status,
                    loading_task_id=task["id"],
                    user_ids=f'"{user_id}"' if user_id else None,
                )
    # save request must be manually reviewed for acceptation
    elif save_request_status == SAVE_REQUEST_PENDING:
        # check if there is already such a save request already submitted,
        # no need to add it to the database in that case
        try:
            sor = SaveOriginRequest.objects.get(
                visit_type=visit_type, origin_url=origin_url, status=save_request_status
            )
            user_ids = sor.user_ids if sor.user_ids is not None else ""
            if user_id is not None and f'"{user_id}"' not in user_ids:
                # update user ids list; when no user was recorded yet, start a
                # fresh list instead of prefixing it with the text "None"
                sor.user_ids = (
                    f'{user_ids},"{user_id}"' if user_ids else f'"{user_id}"'
                )
                sor.save()
        # if not add it to the database
        except ObjectDoesNotExist:
            sor = SaveOriginRequest.objects.create(
                visit_type=visit_type,
                origin_url=origin_url,
                status=save_request_status,
                user_ids=f'"{user_id}"' if user_id else None,
            )
    # origin can not be saved as its url is blacklisted,
    # log the request to the database anyway
    else:
        sor = SaveOriginRequest.objects.create(
            visit_type=visit_type,
            origin_url=origin_url,
            status=save_request_status,
            user_ids=f'"{user_id}"' if user_id else None,
        )

    if save_request_status == SAVE_REQUEST_REJECTED:
        raise ForbiddenExc(
            (
                'The "save code now" request has been rejected '
                "because the provided origin url is blacklisted."
            )
        )

    assert sor is not None
    return _update_save_request_info(sor, task)
def update_save_origin_requests_from_queryset(
    requests_queryset: QuerySet,
) -> List[SaveOriginRequestInfo]:
    """Update all save requests from a SaveOriginRequest queryset, update their status in db
    and return the list of impacted save_requests.

    Args:
        requests_queryset: input SaveOriginRequest queryset

    Returns:
        list: A list of save origin request info dicts as described in
        :func:`swh.web.common.origin_save.create_save_origin_request`

    """
    task_ids = [sor.loading_task_id for sor in requests_queryset]
    save_requests = []
    if task_ids:
        # index scheduler tasks by their id
        tasks = {task["id"]: task for task in scheduler().get_tasks(task_ids)}
        # ask for the runs of the tasks actually found, passing their ids as
        # an explicit list rather than the dict itself
        task_runs = {
            task_run["task"]: task_run
            for task_run in scheduler().get_task_runs(list(tasks))
        }
        for sor in requests_queryset:
            # requests without a known task / task run get None for those
            sr_dict = _update_save_request_info(
                sor, tasks.get(sor.loading_task_id), task_runs.get(sor.loading_task_id),
            )
            save_requests.append(sr_dict)
    return save_requests
def refresh_save_origin_request_statuses() -> List[SaveOriginRequestInfo]:
    """Refresh non-terminal save origin requests (SOR) in the backend.

    The requests considered are the accepted ones submitted within the last
    MAX_THRESHOLD_DAYS days whose visit date or visit status is missing, or
    whose visit status is a non-terminal one.

    Their information is recomputed through
    :func:`update_save_origin_requests_from_queryset`, which also stores the
    changes in db.

    Returns:
        the refreshed information on those SOR (empty list if none matched)

    """
    oldest_request_date = datetime.now(tz=timezone.utc) - timedelta(
        days=MAX_THRESHOLD_DAYS
    )
    # those without the required information we need to update
    needs_update = (
        Q(visit_date__isnull=True)
        | Q(visit_status__isnull=True)
        | Q(visit_status__in=NON_TERMINAL_STATUSES)
    )
    pending_refresh = SaveOriginRequest.objects.filter(
        # Retrieve accepted request statuses (all statuses)
        Q(status=SAVE_REQUEST_ACCEPTED),
        needs_update,
        # limit results to recent ones (that is roughly 30 days old at best)
        Q(request_date__gte=oldest_request_date),
    )
    if pending_refresh.count() > 0:
        return update_save_origin_requests_from_queryset(pending_refresh)
    return []
def get_save_origin_requests(
    visit_type: str, origin_url: str
) -> List[SaveOriginRequestInfo]:
    """
    Retrieve (and refresh) every save request submitted for a given
    software origin and visit type.

    Args:
        visit_type: the type of visit
        origin_url: the url of the origin

    Raises:
        BadInputExc: the visit type or origin url is invalid
        swh.web.common.exc.NotFoundExc: no save requests can be found for the
            given origin

    Returns:
        list: A list of save origin requests dict as described in
        :func:`swh.web.common.origin_save.create_save_origin_request`
    """
    _check_visit_type_savable(visit_type)
    _check_origin_url_valid(origin_url)

    matching_requests = SaveOriginRequest.objects.filter(
        visit_type=visit_type, origin_url=origin_url
    )
    if matching_requests.count() != 0:
        return update_save_origin_requests_from_queryset(matching_requests)

    raise NotFoundExc(
        f"No save requests found for visit of type {visit_type} "
        f"on origin with url {origin_url}."
    )
def get_save_origin_task_info(
    save_request_id: int, full_info: bool = True
) -> Dict[str, Any]:
    """
    Get detailed information about an accepted save origin request
    and its associated loading task.

    An empty dictionary is returned when the save request does not exist,
    or when its loading task or the task run cannot be retrieved from the
    scheduler (e.g. archived and removed from the scheduler database).

    Args:
        save_request_id: identifier of a save origin request
        full_info: whether to return detailed info for staff users

    Returns:
        A dictionary with the following keys:

            - **type**: loading task type
            - **arguments**: loading task arguments
            - **id**: loading task database identifier
            - **backend_id**: loading task celery identifier
            - **scheduled**: loading task scheduling date
            - **ended**: loading task termination date
            - **status**: loading task execution status
            - **visit_status**: Actual visit status

        Depending on the availability of the task logs in the elasticsearch
        cluster of Software Heritage, the returned dictionary may also
        contain the following keys:

            - **name**: associated celery task name
            - **message**: relevant log message from task execution
            - **duration**: task execution time (only if it succeeded)
            - **worker**: name of the worker that executed the task

        When ``full_info`` is False, the **id**, **backend_id** and **worker**
        keys are removed and the traceback of a loading failure message is
        hidden.

    """
    try:
        save_request = SaveOriginRequest.objects.get(id=save_request_id)
    except ObjectDoesNotExist:
        return {}

    found_tasks = scheduler().get_tasks([save_request.loading_task_id])
    task = found_tasks[0] if found_tasks else None
    if task is None:
        return {}

    found_runs = scheduler().get_task_runs([task["id"]])
    task_run = found_runs[0] if found_runs else None
    if task_run is None:
        return {}

    # expose task level information on the run and drop internal fields
    task_run["type"] = task["type"]
    task_run["arguments"] = task["arguments"]
    task_run["id"] = task_run["task"]
    del task_run["task"]
    del task_run["metadata"]
    # Enrich the task run with the loading visit status
    task_run["visit_status"] = save_request.visit_status

    es_workers_index_url = get_config()["es_workers_index_url"]
    if not es_workers_index_url:
        # no elasticsearch index configured: nothing more to add
        return task_run
    es_workers_index_url += "/_search"

    # time window in which the task logs are searched: one week from the
    # visit date when known, MAX_THRESHOLD_DAYS from the request date otherwise
    if save_request.visit_date:
        min_ts = save_request.visit_date
        search_window = timedelta(days=7)
    else:
        min_ts = save_request.request_date
        search_window = timedelta(days=MAX_THRESHOLD_DAYS)
    max_ts = min_ts + search_window
    min_ts_unix = int(min_ts.timestamp()) * 1000
    max_ts_unix = int(max_ts.timestamp()) * 1000

    # log priority searched for: "3" for a failed task, "6" otherwise
    save_task_status = _save_task_status[task["status"]]
    priority = "3" if save_task_status == SAVE_TASK_FAILED else "6"

    timestamp_range = {
        "range": {
            "@timestamp": {
                "gte": min_ts_unix,
                "lte": max_ts_unix,
                "format": "epoch_millis",
            }
        }
    }
    query = {
        "bool": {
            "must": [
                {"match_phrase": {"priority": {"query": priority}}},
                {"match_phrase": {"swh_task_id": {"query": task_run["backend_id"]}}},
                timestamp_range,
            ]
        }
    }

    try:
        response = requests.post(
            es_workers_index_url,
            json={"query": query, "sort": ["@timestamp"]},
            timeout=30,
        )
        results = json.loads(response.text)
        if results["hits"]["total"]["value"] >= 1:
            # hits are sorted by timestamp: use the last returned one
            log_entry = results["hits"]["hits"][-1]["_source"]
            if "swh_logging_args_runtime" in log_entry:
                task_run["duration"] = log_entry["swh_logging_args_runtime"]
            if "message" in log_entry:
                task_run["message"] = log_entry["message"]
            # for each output key, the first log field present wins
            fallbacks = (
                ("name", ("swh_logging_args_name", "swh_task_name")),
                ("worker", ("hostname", "host")),
            )
            for output_key, log_fields in fallbacks:
                for log_field in log_fields:
                    if log_field in log_entry:
                        task_run[output_key] = log_entry[log_field]
                        break
    except Exception as exc:
        logger.warning("Request to Elasticsearch failed\n%s", exc)
        sentry_sdk.capture_exception(exc)

    if not full_info:
        # remove some staff only fields
        for staff_only_field in ("id", "backend_id", "worker"):
            task_run.pop(staff_only_field, None)

        if "message" in task_run and "Loading failure" in task_run["message"]:
            # hide traceback for non staff users, only display exception
            message_lines = task_run["message"].split("\n")
            kept_lines = []
            for line in message_lines:
                if line.startswith("Traceback"):
                    break
                kept_lines.append(f"{line}\n")
            # the last line of the message is always appended
            task_run["message"] = "".join(kept_lines) + message_lines[-1]

    return task_run
# Metric on the number of submitted save requests, per request status and
# visit_type. Values are computed by compute_save_requests_metrics.
SUBMITTED_SAVE_REQUESTS_METRIC = "swh_web_submitted_save_requests"
_submitted_save_requests_gauge = Gauge(
name=SUBMITTED_SAVE_REQUESTS_METRIC,
documentation="Number of submitted origin save requests",
labelnames=["status", "visit_type"],
registry=SWH_WEB_METRICS_REGISTRY,
)
# Metric on the number of accepted save requests, per loading task status and
# visit_type.
ACCEPTED_SAVE_REQUESTS_METRIC = "swh_web_accepted_save_requests"
_accepted_save_requests_gauge = Gauge(
name=ACCEPTED_SAVE_REQUESTS_METRIC,
documentation="Number of accepted origin save requests",
labelnames=["load_task_status", "visit_type"],
registry=SWH_WEB_METRICS_REGISTRY,
)
# Metric on the delay of save code now request per status and visit_type. This is the
# time difference between the save code now is requested and the time it got ingested.
ACCEPTED_SAVE_REQUESTS_DELAY_METRIC = "swh_web_save_requests_delay_seconds"
_accepted_save_requests_delay_gauge = Gauge(
name=ACCEPTED_SAVE_REQUESTS_DELAY_METRIC,
documentation="Save Requests Duration",
labelnames=["load_task_status", "visit_type"],
registry=SWH_WEB_METRICS_REGISTRY,
)
def compute_save_requests_metrics() -> None:
    """Compute Prometheus metrics related to origin save requests:

    - Number of submitted origin save requests
    - Number of accepted origin save requests
    - Save Code Now requests delay between request time and actual time of ingestion

    Every gauge is first reset to zero for each known label combination,
    then all save requests stored in database are counted.

    """
    # for metrics, we want access to all visit types
    visit_types = get_savable_visit_types(privileged_user=True)

    request_statuses = (
        SAVE_REQUEST_ACCEPTED,
        SAVE_REQUEST_REJECTED,
        SAVE_REQUEST_PENDING,
    )
    for request_status, visit_type in product(request_statuses, visit_types):
        _submitted_save_requests_gauge.labels(request_status, visit_type).set(0)

    load_task_statuses = (
        SAVE_TASK_NOT_CREATED,
        SAVE_TASK_NOT_YET_SCHEDULED,
        SAVE_TASK_SCHEDULED,
        SAVE_TASK_SUCCEEDED,
        SAVE_TASK_FAILED,
        SAVE_TASK_RUNNING,
    )
    for task_status, visit_type in product(load_task_statuses, visit_types):
        _accepted_save_requests_gauge.labels(task_status, visit_type).set(0)

    # the delay is only tracked for tasks that reached a final status
    duration_load_task_statuses = (
        SAVE_TASK_FAILED,
        SAVE_TASK_SUCCEEDED,
    )
    for task_status, visit_type in product(duration_load_task_statuses, visit_types):
        _accepted_save_requests_delay_gauge.labels(task_status, visit_type).set(0)

    for sor in SaveOriginRequest.objects.all():
        if sor.status == SAVE_REQUEST_ACCEPTED:
            _accepted_save_requests_gauge.labels(
                load_task_status=sor.loading_task_status, visit_type=sor.visit_type,
            ).inc()

        _submitted_save_requests_gauge.labels(
            status=sor.status, visit_type=sor.visit_type
        ).inc()

        task_is_over = sor.loading_task_status in (
            SAVE_TASK_SUCCEEDED,
            SAVE_TASK_FAILED,
        )
        if not task_is_over or sor.visit_date is None or sor.request_date is None:
            continue

        # seconds elapsed between the request submission and the visit
        delay = sor.visit_date.timestamp() - sor.request_date.timestamp()
        _accepted_save_requests_delay_gauge.labels(
            load_task_status=sor.loading_task_status, visit_type=sor.visit_type,
        ).inc(delay)
diff --git a/swh/web/templates/misc/origin-save.html b/swh/web/templates/misc/origin-save.html
index 60fa8d72..7d676721 100644
--- a/swh/web/templates/misc/origin-save.html
+++ b/swh/web/templates/misc/origin-save.html
@@ -1,141 +1,144 @@
{% extends "../layout.html" %}
{% comment %}
Copyright (C) 2018-2021 The Software Heritage developers
See the AUTHORS file at the top-level directory of this distribution
License: GNU Affero General Public License version 3, or any later version
See top-level LICENSE file for more information
{% endcomment %}
{% load render_bundle from webpack_loader %}
{% load static %}
{% block title %}{{ heading }} – Software Heritage archive{% endblock %}
{% block header %}
{% render_bundle 'save' %}
{% endblock %}
{% block navbar-content %}
Save code now
{% endblock %}
{% block content %}
You can contribute to extend the content of the Software Heritage archive by submitting an origin
save request. To do so, fill the required info in the form below:
Origin url: the url of the remote repository for the software origin.
In order to avoid saving errors from Software Heritage, you should provide the clone/checkout url
as given by the provider hosting the software origin. It can easily be found in the
web interface used to browse the software origin. For instance, if you want to save a git
origin into the archive, you should check that the command $ git clone <origin_url>
does not return an error before submitting a request.
Once submitted, your save request can either be:
accepted: a visit to the provided origin will then be scheduled by Software Heritage in order to
load its content into the archive as soon as possible
rejected: the provided origin url is blacklisted and no visit will be scheduled
put in pending state: a manual review will then be performed in order to determine if the
origin can be safely loaded or not into the archive
Once a save request has been accepted, you can follow its current status in the
submitted save requests list.
If you submitted requests while authenticated, you will be able
to display only your own requests.
Date
Type
Url
Request
Status
Info
{% endblock %}
diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py
index fead2e30..5b0ef04f 100644
--- a/swh/web/tests/conftest.py
+++ b/swh/web/tests/conftest.py
@@ -1,482 +1,497 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import timedelta
import json
import os
import shutil
from subprocess import PIPE, run
import sys
from typing import Any, Dict, List, Optional
from _pytest.python import Function
from hypothesis import HealthCheck, settings
import pytest
from django.core.cache import cache
from rest_framework.test import APIClient, APIRequestFactory
from swh.model.hashutil import ALGORITHMS, hash_to_bytes
from swh.scheduler.tests.common import TASK_TYPES
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.algos.snapshot import snapshot_get_all_branches, snapshot_get_latest
from swh.web.auth.utils import OIDC_SWH_WEB_CLIENT_ID
from swh.web.common import converters
from swh.web.common.origin_save import get_scheduler_load_task_types
from swh.web.common.typing import OriginVisitInfo
from swh.web.config import get_config
from swh.web.tests.data import get_tests_data, override_storages
# Flags used to skip tests whose external tools are not installed
ctags_json_missing = not (
    shutil.which("ctags") is not None
    and b"+json" in run(["ctags", "--version"], stdout=PIPE).stdout
)

fossology_missing = shutil.which("nomossa") is None

# Hypothesis profiles available to the test suite
settings.register_profile("default", settings())

# getattr is used here so that mypy stays happy whatever the installed
# hypothesis version is
function_scoped_fixture_check = []
if hasattr(HealthCheck, "function_scoped_fixture"):
    function_scoped_fixture_check.append(
        getattr(HealthCheck, "function_scoped_fixture")
    )

suppress_health_check = [
    HealthCheck.too_slow,
    HealthCheck.filter_too_much,
    *function_scoped_fixture_check,
]

settings.register_profile(
    "swh-web",
    settings(deadline=None, suppress_health_check=suppress_health_check),
)

settings.register_profile(
    "swh-web-fast",
    settings(
        deadline=None,
        max_examples=5,
        suppress_health_check=suppress_health_check,
    ),
)
def pytest_configure(config):
    """Select the hypothesis profile and make sure a webpack stats file exists.

    When no ``--hypothesis-profile`` option was given, the fast profile is
    loaded. Then, unless a ``webpack-stats.json`` file is already present in
    the static folder, a dummy one listing every bundle folder is written.
    """
    # Fall back to the fast hypothesis profile when none was explicitly
    # requested on the pytest command line
    if config.getoption("--hypothesis-profile") is None:
        settings.load_profile("swh-web-fast")

    # Small hack allowing the unit tests to run without the static assets
    # generated by webpack: the Python tests do not really need them, but
    # django templates fail to load when the generated webpack-stats.json
    # file, describing the js and css files to include, is missing.
    # A dummy webpack-stats.json is thus generated to work around that.
    tests_root = os.path.dirname(__file__)
    # location of the static folder when running tests through tox
    data_dir = os.path.join(sys.prefix, "share/swh/web")
    static_dir = os.path.join(data_dir, "static")
    if not os.path.exists(static_dir):
        # location of the static folder when running tests locally with pytest
        static_dir = os.path.join(tests_root, "../../../static")

    webpack_stats = os.path.join(static_dir, "webpack-stats.json")
    if os.path.exists(webpack_stats):
        return

    bundles_dir = os.path.join(tests_root, "../../../assets/src/bundles")
    if not os.path.exists(bundles_dir):
        # location of the bundles folder when running tests with tox
        bundles_dir = os.path.join(data_dir, "assets/src/bundles")

    # only the immediate sub-folders of the bundles folder are of interest
    bundle_names = next(os.walk(bundles_dir))[1]

    chunks = {}
    assets = {}
    for bundle_name in bundle_names:
        asset = f"js/{bundle_name}.js"
        chunks[bundle_name] = [asset]
        assets[asset] = {"name": asset, "publicPath": f"/static/{asset}"}

    mock_webpack_stats = {
        "status": "done",
        "publicPath": "/static",
        "chunks": chunks,
        "assets": assets,
    }
    with open(webpack_stats, "w") as stats_file:
        json.dump(mock_webpack_stats, stats_file)
# Autouse fixture: the Django cache is emptied before each test
@pytest.fixture(autouse=True)
def django_cache_cleared():
    """Clear the Django cache."""
    cache.clear()
# Expose the rf fixture from pytest-django under a more explicit name
@pytest.fixture
def request_factory(rf):
    """Return the ``rf`` fixture unchanged."""
    return rf
# Test client from Django REST Framework
@pytest.fixture
def api_client():
    """Return a new Django REST Framework ``APIClient``."""
    client = APIClient()
    return client
# API request factory from Django REST Framework
@pytest.fixture
def api_request_factory():
    """Return a new Django REST Framework ``APIRequestFactory``."""
    factory = APIRequestFactory()
    return factory
# Tests data are (re)initialized for every test
@pytest.fixture(scope="function", autouse=True)
def tests_data():
    """Reset the tests data and plug their storages into swh-web config."""
    data = get_tests_data(reset=True)
    # Update swh-web configuration to use the in-memory storages
    # instantiated in the tests.data module
    storage_keys = ("storage", "idx_storage", "search", "counters")
    override_storages(*(data[key] for key in storage_keys))
    return data
# Gives access to the data of the sample archive used in the tests
@pytest.fixture(scope="function")
def archive_data(tests_data):
    """Wrap the tests data into an ``_ArchiveData`` helper."""
    helper = _ArchiveData(tests_data)
    return helper
# Gives access to the indexer data of the sample archive used in the tests
@pytest.fixture(scope="function")
def indexer_data(tests_data):
    """Wrap the tests data into an ``_IndexerData`` helper."""
    helper = _IndexerData(tests_data)
    return helper
# Data directory used by requests_mock
@pytest.fixture
def datadir():
    """Return the absolute path of the ``resources`` folder next to this file."""
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    return os.path.join(tests_dir, "resources")
class _ArchiveData:
    """
    Helper class giving access to the data of a sample test archive.

    It wraps the in-memory storage holding the raw tests data and mostly
    acts as a proxy to the Storage interface. Some methods are overridden
    so that tests data are returned in a json serializable format, which
    eases tests implementation.
    """

    def __init__(self, tests_data):
        self.storage = tests_data["storage"]

    def __getattr__(self, key):
        # Only called when regular attribute lookup fails: anything but the
        # wrapped storage itself is forwarded to the storage instance
        if key != "storage":
            return getattr(self.storage, key)
        raise AttributeError(key)

    def content_find(self, content: Dict[str, Any]) -> Dict[str, Any]:
        """Look a content up from the hashes available in ``content``."""
        hashes = {}
        for algo in ALGORITHMS:
            if content.get(algo):
                hashes[algo] = hash_to_bytes(content[algo])
        found = self.storage.content_find(hashes)
        if not found:
            return found
        return converters.from_content(found[0].to_dict())

    def content_get(self, cnt_id: str) -> Dict[str, Any]:
        """Get a content, without its ``ctime``, from its hexadecimal id."""
        content = self.storage.content_get([hash_to_bytes(cnt_id)])[0]
        content_d = None
        if content:
            content_d = content.to_dict()
            content_d.pop("ctime", None)
        return converters.from_swh(
            content_d, hashess={"sha1", "sha1_git", "sha256", "blake2s256"}
        )

    def content_get_data(self, cnt_id: str) -> Optional[Dict[str, Any]]:
        """Get the raw data of a content, or None when the storage has none."""
        sha1 = hash_to_bytes(cnt_id)
        raw_data = self.storage.content_get_data(sha1)
        if raw_data is not None:
            return converters.from_content({"data": raw_data, "sha1": sha1})
        return None

    def directory_get(self, dir_id):
        """Get a directory id along with the listing of its entries."""
        entries = self.directory_ls(dir_id)
        return {"id": dir_id, "content": entries}

    def directory_ls(self, dir_id):
        """List the converted entries of a directory."""
        dir_id_bytes = hash_to_bytes(dir_id)
        return [
            converters.from_directory_entry(entry)
            for entry in self.storage.directory_ls(dir_id_bytes)
        ]

    def release_get(self, rel_id: str) -> Optional[Dict[str, Any]]:
        """Get a converted release, or None when the storage has none."""
        release = self.storage.release_get([hash_to_bytes(rel_id)])[0]
        if not release:
            return None
        return converters.from_release(release)

    def revision_get(self, rev_id: str) -> Optional[Dict[str, Any]]:
        """Get a converted revision, or None when the storage has none."""
        revision = self.storage.revision_get([hash_to_bytes(rev_id)])[0]
        if not revision:
            return None
        return converters.from_revision(revision)

    def revision_log(self, rev_id, limit=None):
        """Get the converted revision log starting from ``rev_id``."""
        log = self.storage.revision_log([hash_to_bytes(rev_id)], limit=limit)
        return [converters.from_revision(revision) for revision in log]

    def snapshot_get_latest(self, origin_url):
        """Get the converted latest snapshot of an origin."""
        latest = snapshot_get_latest(self.storage, origin_url)
        return converters.from_snapshot(latest.to_dict())

    def origin_get(self, origin_urls):
        """Get the converted origins matching the given urls."""
        return [
            converters.from_origin(origin.to_dict())
            for origin in self.storage.origin_get(origin_urls)
        ]

    def origin_visit_get(self, origin_url):
        """Get all visits of an origin, each merged with its latest status."""
        visits = []
        page_token = None
        while True:
            page = self.storage.origin_visit_get(origin_url, page_token=page_token)
            page_token = page.next_page_token
            for visit in page.results:
                latest_status = self.storage.origin_visit_status_get_latest(
                    origin_url, visit.visit
                )
                visit_d = {**latest_status.to_dict(), "type": visit.type}
                visits.append(converters.from_origin_visit(visit_d))
            # stop once the storage reports no further page
            if not page_token:
                break
        return visits

    def origin_visit_get_by(self, origin_url: str, visit_id: int) -> OriginVisitInfo:
        """Get a single visit of an origin merged with its latest status."""
        visit = self.storage.origin_visit_get_by(origin_url, visit_id)
        assert visit is not None
        latest_status = self.storage.origin_visit_status_get_latest(
            origin_url, visit_id
        )
        assert latest_status is not None
        visit_d = {**latest_status.to_dict(), "type": visit.type}
        return converters.from_origin_visit(visit_d)

    def origin_visit_status_get_latest(
        self,
        origin_url,
        type: Optional[str] = None,
        allowed_statuses: Optional[List[str]] = None,
        require_snapshot: bool = False,
    ):
        """Get the converted latest visit status of an origin, if any."""
        latest_status = origin_get_latest_visit_status(
            self.storage,
            origin_url,
            type=type,
            allowed_statuses=allowed_statuses,
            require_snapshot=require_snapshot,
        )
        if not latest_status:
            return None
        return converters.from_origin_visit(latest_status.to_dict())

    def snapshot_get(self, snapshot_id):
        """Get a converted snapshot with all its branches."""
        snapshot = snapshot_get_all_branches(self.storage, hash_to_bytes(snapshot_id))
        return converters.from_snapshot(snapshot.to_dict())

    def snapshot_get_branches(
        self, snapshot_id, branches_from="", branches_count=1000, target_types=None
    ):
        """Get a converted subset of the branches of a snapshot."""
        branches = self.storage.snapshot_get_branches(
            hash_to_bytes(snapshot_id),
            branches_from.encode(),
            branches_count,
            target_types,
        )
        return converters.from_partial_branches(branches)

    def snapshot_get_head(self, snapshot):
        """Get the target of the HEAD branch, following it once if an alias."""
        branches = snapshot["branches"]
        head_branch = branches["HEAD"]
        is_alias = head_branch["target_type"] == "alias"
        target = head_branch["target"]
        return branches[target]["target"] if is_alias else target

    def snapshot_count_branches(self, snapshot_id):
        """Count the branches of a snapshot per target type."""
        counts = {"alias": 0, "release": 0, "revision": 0}
        counts.update(self.storage.snapshot_count_branches(hash_to_bytes(snapshot_id)))
        # drop the count stored under the None key, if any
        counts.pop(None, None)
        return counts
class _IndexerData:
    """
    Helper class giving access to the indexer tests data.

    It wraps the in-memory indexer storage holding the raw tests data,
    along with the indexers filling it, and returns those data in a json
    serializable format in order to ease tests implementation.
    """

    def __init__(self, tests_data):
        self.idx_storage = tests_data["idx_storage"]
        for indexer in ("mimetype_indexer", "license_indexer", "ctags_indexer"):
            setattr(self, indexer, tests_data[indexer])

    def content_add_mimetype(self, cnt_id):
        """Run the mimetype indexer on a content."""
        content_ids = [hash_to_bytes(cnt_id)]
        self.mimetype_indexer.run(content_ids)

    def content_get_mimetype(self, cnt_id):
        """Get the converted mimetype data indexed for a content."""
        mimetypes = self.idx_storage.content_mimetype_get([hash_to_bytes(cnt_id)])
        return converters.from_filetype(mimetypes[0].to_dict())

    def content_add_license(self, cnt_id):
        """Run the license indexer on a content."""
        content_ids = [hash_to_bytes(cnt_id)]
        self.license_indexer.run(content_ids)

    def content_get_license(self, cnt_id):
        """Yield the converted licenses indexed for a content."""
        content_ids = [hash_to_bytes(cnt_id)]
        for indexed in self.idx_storage.content_fossology_license_get(content_ids):
            yield converters.from_swh(indexed.to_dict(), hashess={"id"})

    def content_add_ctags(self, cnt_id):
        """Run the ctags indexer on a content."""
        content_ids = [hash_to_bytes(cnt_id)]
        self.ctags_indexer.run(content_ids)

    def content_get_ctags(self, cnt_id):
        """Yield the converted ctags indexed for a content."""
        content_ids = [hash_to_bytes(cnt_id)]
        for indexed in self.idx_storage.content_ctags_get(content_ids):
            yield converters.from_swh(indexed, hashess={"id"})
@pytest.fixture
def keycloak_oidc(keycloak_oidc, mocker):
    """Configure the ``keycloak_oidc`` fixture from swh-web configuration.

    The patched ``swh.web.auth.views.keycloak_oidc_client`` returns the
    configured object.
    """
    keycloak_config = get_config()["keycloak"]
    for option in ("server_url", "realm_name"):
        setattr(keycloak_oidc, option, keycloak_config[option])
    keycloak_oidc.client_id = OIDC_SWH_WEB_CLIENT_ID

    client_getter = mocker.patch("swh.web.auth.views.keycloak_oidc_client")
    client_getter.return_value = keycloak_oidc
    return keycloak_oidc
@pytest.fixture
def subtest(request):
    """A hack to explicitly set up and tear down fixtures.

    With this fixture, fixtures can be set up and torn down from within the
    test function itself. This is useful (necessary!) when using Hypothesis
    inside pytest: hypothesis calls the test function multiple times without
    setting up or tearing down fixture state as it is normally the case.

    Copied from the pytest-subtesthack project, public domain license
    (https://github.com/untitaker/pytest-subtesthack).
    """
    parent_test = request.node

    def inner(func):
        test_name = request.function.__name__
        item_name = test_name + "[]"
        if not hasattr(Function, "from_parent"):
            item = Function(name=item_name, parent=parent_test, callobj=func)
        else:
            item = Function.from_parent(
                parent_test, name=item_name, originalname=test_name, callobj=func,
            )

        # passing the parent test as next item prevents pytest from tearing
        # down module fixtures
        nextitem = parent_test
        item.ihook.pytest_runtest_setup(item=item)
        item.ihook.pytest_runtest_call(item=item)
        item.ihook.pytest_runtest_teardown(item=item, nextitem=nextitem)

    return inner
@pytest.fixture
def swh_scheduler(swh_scheduler):
    """Plug the test scheduler into swh-web configuration.

    The load task types needed by the save origin tests (git, hg, svn and
    cvs) are registered in the scheduler, which also gets an
    ``add_load_archive_task_type`` method to register the
    ``load-archive-files`` task type on demand. Once the test is over, the
    previously configured scheduler is restored and the cache of
    ``get_scheduler_load_task_types`` is cleared.
    """
    config = get_config()
    scheduler = config["scheduler"]
    config["scheduler"] = swh_scheduler

    def create_load_task_type(task_type, description, backend_name):
        # All load task types created here share the same scheduling policy,
        # only their identity differs.
        swh_scheduler.create_task_type(
            {
                "type": task_type,
                "description": description,
                "backend_name": backend_name,
                "default_interval": timedelta(days=64),
                "min_interval": timedelta(hours=12),
                "max_interval": timedelta(days=64),
                "backoff_factor": 2,
                "max_queue_length": None,
                "num_retries": 7,
                "retry_delay": timedelta(hours=2),
            }
        )

    # create load-git and load-hg task types
    for task_type in TASK_TYPES.values():
        swh_scheduler.create_task_type(task_type)

    # create load-svn task type
    create_load_task_type(
        "load-svn",
        "Update a Subversion repository",
        "swh.loader.svn.tasks.DumpMountAndLoadSvnRepository",
    )

    # create load-cvs task type; its backend is the task of the CVS loader,
    # not the one of the Subversion loader
    create_load_task_type(
        "load-cvs",
        "Update a CVS repository",
        "swh.loader.cvs.tasks.LoadCvsRepository",
    )

    # add method to add load-archive-files task type during tests
    def add_load_archive_task_type():
        create_load_task_type(
            "load-archive-files",
            "Load tarballs",
            "swh.loader.package.archive.tasks.LoadArchive",
        )

    swh_scheduler.add_load_archive_task_type = add_load_archive_task_type

    yield swh_scheduler

    # teardown: restore the original scheduler and forget cached task types
    config["scheduler"] = scheduler
    get_scheduler_load_task_types.cache_clear()
diff --git a/swh/web/tests/misc/test_origin_save.py b/swh/web/tests/misc/test_origin_save.py
index b357a138..bd9561d2 100644
--- a/swh/web/tests/misc/test_origin_save.py
+++ b/swh/web/tests/misc/test_origin_save.py
@@ -1,153 +1,153 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
import json
import pytest
from swh.auth.django.utils import oidc_user_from_profile
from swh.web.common.models import SaveOriginRequest
from swh.web.common.origin_save import SAVE_REQUEST_ACCEPTED, SAVE_TASK_SUCCEEDED
from swh.web.common.utils import reverse
from swh.web.tests.utils import check_http_get_response
# Visit types used to create and filter save requests in the tests below
-VISIT_TYPES = ("git", "svn", "hg")
+VISIT_TYPES = ("git", "svn", "hg", "cvs")
# Same visit types plus "archives" (presumably reserved to privileged
# users, going by the name -- TODO confirm)
PRIVILEGED_VISIT_TYPES = tuple(list(VISIT_TYPES) + ["archives"])
def test_old_save_url_redirection(client):
    """The ``browse-origin-save`` url must redirect to ``origin-save``."""
    old_url = reverse("browse-origin-save")
    expected_location = reverse("origin-save")

    response = check_http_get_response(client, old_url, status_code=302)

    assert response["location"] == expected_location
@pytest.mark.django_db
def test_save_origin_requests_list(client, mocker):
    """Check pagination and search filtering of the save requests list.

    Ten save requests are created for each visit type, then the list
    endpoint is queried page by page, and once per visit type with a
    search filter.
    """
    nb_origins_per_type = 10
    nb_requests = len(VISIT_TYPES) * nb_origins_per_type

    for visit_type in VISIT_TYPES:
        for i in range(nb_origins_per_type):
            SaveOriginRequest.objects.create(
                request_date=datetime.now(tz=timezone.utc),
                visit_type=visit_type,
                origin_url=f"https://{visit_type}.example.org/project{i}",
                status=SAVE_REQUEST_ACCEPTED,
                visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1),
                loading_task_id=i,
                loading_task_status=SAVE_TASK_SUCCEEDED,
            )

    mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler")
    mock_scheduler.get_tasks.return_value = []
    mock_scheduler.get_task_runs.return_value = []

    def get_save_requests(draw, search, start):
        # Query one page of save requests sorted by descending request date
        # and return the decoded JSON response.
        url = reverse(
            "origin-save-requests-list",
            url_args={"status": "all"},
            query_params={
                "draw": draw,
                "search[value]": search,
                "order[0][column]": "0",
                "columns[0][name]": "request_date",
                "order[0][dir]": "desc",
                "length": nb_origins_per_type,
                "start": start,
            },
        )
        resp = check_http_get_response(
            client, url, status_code=200, content_type="application/json"
        )
        return json.loads(resp.content.decode("utf-8"))

    # retrieve all save requests, one page per visit type, sorted in
    # descending order of request creation
    for i, visit_type in enumerate(reversed(VISIT_TYPES)):
        sors = get_save_requests(
            draw=i + 1, search="", start=i * nb_origins_per_type
        )
        assert sors["draw"] == i + 1
        assert sors["recordsFiltered"] == nb_requests
        assert sors["recordsTotal"] == nb_requests
        assert len(sors["data"]) == nb_origins_per_type
        assert all(d["visit_type"] == visit_type for d in sors["data"])

    # retrieve save requests filtered by visit type in a single page
    for i, visit_type in enumerate(reversed(VISIT_TYPES)):
        sors = get_save_requests(draw=i + 1, search=visit_type, start=0)
        assert sors["draw"] == i + 1
        assert sors["recordsFiltered"] == nb_origins_per_type
        assert sors["recordsTotal"] == nb_requests
        assert len(sors["data"]) == nb_origins_per_type
        assert all(d["visit_type"] == visit_type for d in sors["data"])
@pytest.mark.django_db
def test_save_origin_requests_list_user_filter(client, mocker, keycloak_oidc):
    """Only the requests of the logged in user are listed when asked so."""
    # a first save request is created by an anonymous user
    SaveOriginRequest.objects.create(
        request_date=datetime.now(tz=timezone.utc),
        visit_type="svn",
        origin_url="https://svn.example.org/user/project",
        status=SAVE_REQUEST_ACCEPTED,
        visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1),
        loading_task_id=1,
        loading_task_status=SAVE_TASK_SUCCEEDED,
    )

    # a second one is created by an authenticated user
    user = oidc_user_from_profile(keycloak_oidc, keycloak_oidc.login())
    client.login(code="", code_verifier="", redirect_uri="")

    user_request = SaveOriginRequest.objects.create(
        request_date=datetime.now(tz=timezone.utc),
        visit_type="git",
        origin_url="https://git.example.org/user/project",
        status=SAVE_REQUEST_ACCEPTED,
        visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1),
        loading_task_id=2,
        loading_task_status=SAVE_TASK_SUCCEEDED,
        user_ids=f'"{user.id}"',
    )

    # list save requests, keeping only those of the logged in user
    query_params = {
        "draw": 1,
        "search[value]": "",
        "order[0][column]": "0",
        "columns[0][name]": "request_date",
        "order[0][dir]": "desc",
        "length": 10,
        "start": "0",
        "user_requests_only": "1",
    }
    url = reverse(
        "origin-save-requests-list",
        url_args={"status": "all"},
        query_params=query_params,
    )

    response = check_http_get_response(
        client, url, status_code=200, content_type="application/json"
    )
    listing = json.loads(response.content.decode("utf-8"))

    assert listing["recordsFiltered"] == 1
    assert listing["recordsTotal"] == 2
    assert listing["data"][0] == user_request.to_dict()