Changeset View
Changeset View
Standalone View
Standalone View
swh/web/common/origin_save.py
Show First 20 Lines • Show All 105 Lines • ▼ Show 20 Lines | if bypass_pending_review: | ||||
SaveAuthorizedOrigin.objects.get_or_create(url=origin_url) | SaveAuthorizedOrigin.objects.get_or_create(url=origin_url) | ||||
return SAVE_REQUEST_ACCEPTED | return SAVE_REQUEST_ACCEPTED | ||||
else: | else: | ||||
return SAVE_REQUEST_PENDING | return SAVE_REQUEST_PENDING | ||||
# map visit type to scheduler task | # map visit type to scheduler task | ||||
# TODO: do not hardcode the task name here (T1157) | # TODO: do not hardcode the task name here (T1157) | ||||
_visit_type_task = {"git": "load-git", "hg": "load-hg", "svn": "load-svn"} | _visit_type_task = { | ||||
"git": "load-git", | |||||
"hg": "load-hg", | |||||
"svn": "load-svn", | |||||
# TODO: Limit access to ambassador | |||||
"bundle": "load-archive-files", | |||||
} | |||||
# map scheduler task status to origin save status | # map scheduler task status to origin save status | ||||
_save_task_status = { | _save_task_status = { | ||||
"next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED, | "next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED, | ||||
"next_run_scheduled": SAVE_TASK_SCHEDULED, | "next_run_scheduled": SAVE_TASK_SCHEDULED, | ||||
"completed": SAVE_TASK_SUCCEEDED, | "completed": SAVE_TASK_SUCCEEDED, | ||||
"disabled": SAVE_TASK_FAILED, | "disabled": SAVE_TASK_FAILED, | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | def origin_exists(origin_url: str) -> OriginExistenceCheckInfo: | ||||
""" | """ | ||||
resp = requests.head(origin_url) | resp = requests.head(origin_url) | ||||
exists = resp.ok | exists = resp.ok | ||||
content_length: Optional[int] = None | content_length: Optional[int] = None | ||||
last_modified: Optional[str] = None | last_modified: Optional[str] = None | ||||
if exists: | if exists: | ||||
size_ = resp.headers.get("Content-Length") | size_ = resp.headers.get("Content-Length") | ||||
content_length = int(size_) if size_ else None | content_length = int(size_) if size_ else None | ||||
last_modified = resp.headers.get("Last-Modified") | try: | ||||
date_str = resp.headers["Last-Modified"] | |||||
date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z") | |||||
last_modified = date.isoformat() | |||||
except (KeyError, ValueError): | |||||
# if not provided or not parsable, simply keep it None | |||||
pass | |||||
return OriginExistenceCheckInfo( | return OriginExistenceCheckInfo( | ||||
origin_url=origin_url, | origin_url=origin_url, | ||||
exists=exists, | exists=exists, | ||||
last_modified=last_modified, | last_modified=last_modified, | ||||
content_length=content_length, | content_length=content_length, | ||||
) | ) | ||||
def _check_origin_exists(origin_url: str) -> None: | def _check_origin_exists(origin_url: Optional[str]) -> OriginExistenceCheckInfo: | ||||
"""Ensure the origin exists, if not raise an explicit message.""" | """Ensure the origin exists, if not raise an explicit message.""" | ||||
check = origin_exists(origin_url) | if not origin_url: | ||||
if not check["exists"]: | raise BadInputExc("The origin url provided must be set!") | ||||
metadata = origin_exists(origin_url) | |||||
if not metadata["exists"]: | |||||
raise BadInputExc( | raise BadInputExc( | ||||
f"The provided origin url ({escape(origin_url)}) does not exist!" | f"The provided origin url ({escape(origin_url)}) does not exist!" | ||||
) | ) | ||||
return metadata | |||||
def _get_visit_info_for_save_request( | def _get_visit_info_for_save_request( | ||||
save_request: SaveOriginRequest, | save_request: SaveOriginRequest, | ||||
) -> Tuple[Optional[datetime], Optional[str]]: | ) -> Tuple[Optional[datetime], Optional[str]]: | ||||
"""Retrieve visit information out of a save request | """Retrieve visit information out of a save request | ||||
Args: | Args: | ||||
save_request: Input save origin request to retrieve information for. | save_request: Input save origin request to retrieve information for. | ||||
▲ Show 20 Lines • Show All 130 Lines • ▼ Show 20 Lines | ) -> SaveOriginRequestInfo: | ||||
return save_request.to_dict() | return save_request.to_dict() | ||||
def create_save_origin_request( | def create_save_origin_request( | ||||
visit_type: str, | visit_type: str, | ||||
origin_url: str, | origin_url: str, | ||||
bypass_pending_review: bool = False, | bypass_pending_review: bool = False, | ||||
user_id: Optional[int] = None, | user_id: Optional[int] = None, | ||||
**kwargs, | |||||
) -> SaveOriginRequestInfo: | ) -> SaveOriginRequestInfo: | ||||
""" | """Create a loading task to save a software origin into the archive. | ||||
Create a loading task to save a software origin into the archive. | |||||
This function aims to create a software origin loading task | This function aims to create a software origin loading task through the use of the | ||||
through the use of the swh-scheduler component. | swh-scheduler component. | ||||
First, some checks are performed to see if the visit type and origin | First, some checks are performed to see if the visit type and origin url are valid | ||||
url are valid but also if the save request can be accepted. | but also if the save request can be accepted. For the 'bundle' visit type, this | ||||
If those checks passed, the loading task is then created. | also ensures the artifacts actually exist. If those checks passed, the loading task | ||||
Otherwise, the save request is put in pending or rejected state. | is then created. Otherwise, the save request is put in pending or rejected state. | ||||
All the submitted save requests are logged into the swh-web | All the submitted save requests are logged into the swh-web database to keep track | ||||
database to keep track of them. | of them. | ||||
Args: | Args: | ||||
visit_type: the type of visit to perform (e.g git, hg, svn, ...) | visit_type: the type of visit to perform (e.g. git, hg, svn, bundle, ...) | ||||
origin_url: the url of the origin to save | origin_url: the url of the origin to save | ||||
kwargs: Optional parameters (e.g. artifact_url, artifact_filename, | |||||
artifact_version) | |||||
Raises: | Raises: | ||||
BadInputExc: the visit type or origin url is invalid or inexistent | BadInputExc: the visit type or origin url is invalid or inexistent | ||||
ForbiddenExc: the provided origin url is blacklisted | ForbiddenExc: the provided origin url is blacklisted | ||||
Returns: | Returns: | ||||
dict: A dict describing the save request with the following keys: | dict: A dict describing the save request with the following keys: | ||||
* **visit_type**: the type of visit to perform | * **visit_type**: the type of visit to perform | ||||
* **origin_url**: the url of the origin | * **origin_url**: the url of the origin | ||||
* **save_request_date**: the date the request was submitted | * **save_request_date**: the date the request was submitted | ||||
* **save_request_status**: the request status, either **accepted**, | * **save_request_status**: the request status, either **accepted**, | ||||
**rejected** or **pending** | **rejected** or **pending** | ||||
* **save_task_status**: the origin loading task status, either | * **save_task_status**: the origin loading task status, either | ||||
**not created**, **not yet scheduled**, **scheduled**, | **not created**, **not yet scheduled**, **scheduled**, | ||||
**succeed** or **failed** | **succeed** or **failed** | ||||
""" | """ | ||||
_check_visit_type_savable(visit_type) | _check_visit_type_savable(visit_type) | ||||
_check_origin_url_valid(origin_url) | _check_origin_url_valid(origin_url) | ||||
artifact_url = kwargs.get("artifact_url") | |||||
if visit_type == "bundle": | |||||
metadata = _check_origin_exists(artifact_url) | |||||
# if all checks passed so far, we can try and save the origin | # if all checks passed so far, we can try and save the origin | ||||
save_request_status = can_save_origin(origin_url, bypass_pending_review) | save_request_status = can_save_origin(origin_url, bypass_pending_review) | ||||
task = None | task = None | ||||
# if the origin save request is accepted, create a scheduler | # if the origin save request is accepted, create a scheduler | ||||
# task to load it into the archive | # task to load it into the archive | ||||
if save_request_status == SAVE_REQUEST_ACCEPTED: | if save_request_status == SAVE_REQUEST_ACCEPTED: | ||||
# create a task with high priority | # create a task with high priority | ||||
kwargs = { | task_kwargs: Dict[str, Any] = { | ||||
"priority": "high", | "priority": "high", | ||||
"url": origin_url, | "url": origin_url, | ||||
} | } | ||||
if visit_type == "bundle": | |||||
# extra arguments for that type are required | |||||
assert metadata is not None | |||||
task_kwargs = dict( | |||||
**task_kwargs, | |||||
artifacts=[ | |||||
{ | |||||
"url": artifact_url, | |||||
"filename": kwargs["artifact_filename"], | |||||
"version": kwargs["artifact_version"], | |||||
"time": metadata["last_modified"], | |||||
"length": metadata["content_length"], | |||||
} | |||||
], | |||||
) | |||||
sor = None | sor = None | ||||
# get list of previously submitted save requests | # get list of previously submitted save requests | ||||
current_sors = list( | current_sors = list( | ||||
SaveOriginRequest.objects.filter( | SaveOriginRequest.objects.filter( | ||||
visit_type=visit_type, origin_url=origin_url | visit_type=visit_type, origin_url=origin_url | ||||
) | ) | ||||
) | ) | ||||
Show All 25 Lines | if save_request_status == SAVE_REQUEST_ACCEPTED: | ||||
): | ): | ||||
can_create_task = True | can_create_task = True | ||||
sor = None | sor = None | ||||
else: | else: | ||||
can_create_task = False | can_create_task = False | ||||
if can_create_task: | if can_create_task: | ||||
# effectively create the scheduler task | # effectively create the scheduler task | ||||
task_dict = create_oneshot_task_dict(_visit_type_task[visit_type], **kwargs) | task_dict = create_oneshot_task_dict( | ||||
_visit_type_task[visit_type], **task_kwargs | |||||
) | |||||
task = scheduler.create_tasks([task_dict])[0] | task = scheduler.create_tasks([task_dict])[0] | ||||
# pending save request has been accepted | # pending save request has been accepted | ||||
if sor: | if sor: | ||||
sor.status = SAVE_REQUEST_ACCEPTED | sor.status = SAVE_REQUEST_ACCEPTED | ||||
sor.loading_task_id = task["id"] | sor.loading_task_id = task["id"] | ||||
sor.save() | sor.save() | ||||
else: | else: | ||||
▲ Show 20 Lines • Show All 368 Lines • Show Last 20 Lines |