You can contribute to extend the content of the Software Heritage archive by submitting an origin
save request. To do so, fill the required info in the form below:
Processing "save code now" request ...
A "Save code now" request takes the following parameters:
- Visit type: the type of version control system the software origin is using.
Currently, the supported types are:
- Origin url: the url of the remote repository for the software origin.
In order to avoid saving errors from Software Heritage, you should provide the clone/checkout url
as given by the provider hosting the software origin.
It can easily be found in the
web interface used to browse the software origin.
For instance, if you want to save a git
origin into the archive, you should check that the command $ git clone <origin_url>
does not return an error before submitting a request.
Once submitted, your save request can either be:
- accepted: a visit to the provided origin will then be scheduled by Software Heritage in order to
load its content into the archive as soon as possible
- rejected: the provided origin url is blacklisted and no visit will be scheduled
- put in pending state: a manual review will then be performed in order to determine if the
origin can be safely loaded or not into the archive
Once a save request has been accepted, you can follow its current status in the
submitted save requests list.
Date |
Type |
Url |
Request |
Status |
+ Info |
{% endblock %}
\ No newline at end of file
diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py
index a414821b..4e24721a 100644
--- a/swh/web/tests/common/test_origin_save.py
+++ b/swh/web/tests/common/test_origin_save.py
@@ -1,204 +1,226 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
from datetime import datetime, timedelta, timezone
from functools import partial
import pytest
import requests
from swh.core.pytest_plugin import get_response_cb
from swh.web.common.models import SaveOriginRequest
from swh.web.common.origin_save import (
get_save_origin_task_info,
get_save_origin_requests,
)
from swh.web.common.typing import OriginVisitInfo
from swh.web.config import get_config
_es_url = "http://esnode1.internal.softwareheritage.org:9200"
_es_workers_index_url = "%s/swh_workers-*" % _es_url
_origin_url = "https://gitlab.com/inkscape/inkscape"
_visit_type = "git"
_task_id = 203525448
@pytest.fixture(autouse=True)
def requests_mock_datadir(datadir, requests_mock_datadir):
"""Override default behavior to deal with post method
"""
cb = partial(get_response_cb, datadir=datadir)
requests_mock_datadir.post(re.compile("https?://"), body=cb)
return requests_mock_datadir
@pytest.mark.django_db
def test_get_save_origin_archived_task_info(mocker):
_get_save_origin_task_info_test(mocker, task_archived=True)
@pytest.mark.django_db
-def test_get_save_origin_task_info_with_es(mocker):
+def test_get_save_origin_task_full_info_with_es(mocker):
_get_save_origin_task_info_test(mocker, es_available=True)
+@pytest.mark.django_db
+def test_get_save_origin_task_info_with_es(mocker):
+ _get_save_origin_task_info_test(mocker, es_available=True, full_info=False)
+
+
@pytest.mark.django_db
def test_get_save_origin_task_info_without_es(mocker):
_get_save_origin_task_info_test(mocker, es_available=False)
def _mock_scheduler(mocker, task_status="succeed", task_archived=False):
mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler")
task = (
{
"arguments": {"args": [], "kwargs": {"repo_url": _origin_url},},
"current_interval": timedelta(days=64),
"id": _task_id,
"next_run": datetime.now(tz=timezone.utc) + timedelta(days=64),
"policy": "oneshot",
"priority": "high",
"retries_left": 0,
"status": "disabled",
"type": "load-git",
}
if not task_archived
else None
)
- mock_scheduler.get_tasks.return_value = [task]
+ mock_scheduler.get_tasks.return_value = [dict(task) if task else None]
task_run = {
"backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205",
"ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
"id": 654270631,
"metadata": {},
"scheduled": datetime.now(tz=timezone.utc),
"started": None,
"status": task_status,
"task": _task_id,
}
- mock_scheduler.get_task_runs.return_value = [task_run]
+ mock_scheduler.get_task_runs.return_value = [dict(task_run)]
return task, task_run
-def _get_save_origin_task_info_test(mocker, task_archived=False, es_available=True):
+def _get_save_origin_task_info_test(
+ mocker, task_archived=False, es_available=True, full_info=True
+):
swh_web_config = get_config()
if es_available:
swh_web_config.update({"es_workers_index_url": _es_workers_index_url})
else:
swh_web_config.update({"es_workers_index_url": ""})
sor = SaveOriginRequest.objects.create(
request_date=datetime.now(tz=timezone.utc),
visit_type=_visit_type,
origin_url="https://gitlab.com/inkscape/inkscape",
status="accepted",
visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1),
loading_task_id=_task_id,
)
task, task_run = _mock_scheduler(mocker, task_archived=task_archived)
es_response = requests.post("%s/_search" % _es_workers_index_url).json()
task_exec_data = es_response["hits"]["hits"][-1]["_source"]
- sor_task_info = get_save_origin_task_info(sor.id)
+ sor_task_info = get_save_origin_task_info(sor.id, full_info=full_info)
expected_result = (
{
"type": task["type"],
"arguments": task["arguments"],
"id": task["id"],
"backend_id": task_run["backend_id"],
"scheduled": task_run["scheduled"],
+ "started": task_run["started"],
"ended": task_run["ended"],
"status": task_run["status"],
}
if not task_archived
else {}
)
if es_available and not task_archived:
expected_result.update(
{
"message": task_exec_data["message"],
"name": task_exec_data["swh_task_name"],
"worker": task_exec_data["hostname"],
}
)
+ if not full_info:
+ expected_result.pop("id", None)
+ expected_result.pop("backend_id", None)
+ expected_result.pop("worker", None)
+ if "message" in expected_result:
+ message = ""
+ message_lines = expected_result["message"].split("\n")
+ for line in message_lines:
+ if line.startswith("Traceback"):
+ break
+ message += f"{line}\n"
+ message += message_lines[-1]
+ expected_result["message"] = message
+
assert sor_task_info == expected_result
@pytest.mark.django_db
def test_get_save_origin_requests_find_visit_date(mocker):
# create a save request
SaveOriginRequest.objects.create(
request_date=datetime.now(tz=timezone.utc),
visit_type=_visit_type,
origin_url=_origin_url,
status="accepted",
visit_date=None,
loading_task_id=_task_id,
)
# mock scheduler and services
_mock_scheduler(mocker)
mock_service = mocker.patch("swh.web.common.origin_save.service")
mock_service.lookup_origin.return_value = {"url": _origin_url}
mock_get_origin_visits = mocker.patch(
"swh.web.common.origin_save.get_origin_visits"
)
# create a visit for the save request
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
date=visit_date,
formatted_date="",
metadata={},
origin=_origin_url,
snapshot="",
status="full",
type=_visit_type,
url="",
visit=34,
)
mock_get_origin_visits.return_value = [visit_info]
# check visit date has been correctly found
sors = get_save_origin_requests(_visit_type, _origin_url)
assert len(sors) == 1
assert sors[0]["visit_date"] == visit_date
mock_get_origin_visits.assert_called_once()
# check visit is not searched again when it has been found
get_save_origin_requests(_visit_type, _origin_url)
mock_get_origin_visits.assert_called_once()
# check visit date are not searched for save requests older than
# one month
sor = SaveOriginRequest.objects.create(
visit_type=_visit_type,
origin_url=_origin_url,
status="accepted",
loading_task_id=_task_id,
visit_date=None,
)
sor.request_date = datetime.now(tz=timezone.utc) - timedelta(days=31)
sor.save()
_mock_scheduler(mocker, task_status="failed")
sors = get_save_origin_requests(_visit_type, _origin_url)
assert len(sors) == 2
assert sors[0]["visit_date"] is None
mock_get_origin_visits.assert_called_once()