Changeset View
Changeset View
Standalone View
Standalone View
swh/web/common/origin_save.py
Show All 32 Lines | from swh.web.common.models import ( | ||||
SAVE_TASK_RUNNING, | SAVE_TASK_RUNNING, | ||||
SAVE_TASK_SCHEDULED, | SAVE_TASK_SCHEDULED, | ||||
SAVE_TASK_SUCCEEDED, | SAVE_TASK_SUCCEEDED, | ||||
SaveAuthorizedOrigin, | SaveAuthorizedOrigin, | ||||
SaveOriginRequest, | SaveOriginRequest, | ||||
SaveUnauthorizedOrigin, | SaveUnauthorizedOrigin, | ||||
) | ) | ||||
from swh.web.common.origin_visits import get_origin_visits | from swh.web.common.origin_visits import get_origin_visits | ||||
from swh.web.common.typing import OriginInfo, SaveOriginRequestInfo | from swh.web.common.typing import ( | ||||
OriginExistenceCheckInfo, | |||||
OriginInfo, | |||||
SaveOriginRequestInfo, | |||||
) | |||||
from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc | from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc | ||||
scheduler = config.scheduler() | scheduler = config.scheduler() | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def get_origin_save_authorized_urls() -> List[str]: | def get_origin_save_authorized_urls() -> List[str]: | ||||
▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines | def _check_origin_url_valid(origin_url: str) -> None: | ||||
try: | try: | ||||
_validate_url(origin_url) | _validate_url(origin_url) | ||||
except ValidationError: | except ValidationError: | ||||
raise BadInputExc( | raise BadInputExc( | ||||
"The provided origin url (%s) is not valid!" % escape(origin_url) | "The provided origin url (%s) is not valid!" % escape(origin_url) | ||||
) | ) | ||||
def origin_exists(origin_url: str) -> OriginExistenceCheckInfo:
    """Check the origin url for existence. If it exists, extract some more useful
    information on the origin.

    The check is a single HTTP HEAD request sent to the url; the origin is
    considered to exist when ``requests`` reports the response as ``ok``.

    Args:
        origin_url: the url of the origin to check

    Returns:
        An OriginExistenceCheckInfo with the following fields:

            * **origin_url**: the url that was checked
            * **exists**: whether the HEAD request got an ``ok`` response
            * **last_modified**: raw value of the ``Last-Modified`` response
              header, None if the origin does not exist or the header is absent
            * **content_length**: value of the ``Content-Length`` response header
              as an integer, None if the origin does not exist or if the header
              is absent, empty or not an integer

    """
    # NOTE(review): no timeout is given, so an unresponsive server can block this
    # call indefinitely, and connection errors raised by requests propagate to
    # the caller as is -- confirm this is the intended behavior.
    resp = requests.head(origin_url)
    exists = resp.ok
    content_length: Optional[int] = None
    last_modified: Optional[str] = None
    if exists:
        raw_length = resp.headers.get("Content-Length")
        if raw_length:
            try:
                content_length = int(raw_length)
            except ValueError:
                # The header comes from a remote server we do not control: a
                # malformed value must not make the existence check fail.
                content_length = None
        last_modified = resp.headers.get("Last-Modified")

    return OriginExistenceCheckInfo(
        origin_url=origin_url,
        exists=exists,
        last_modified=last_modified,
        content_length=content_length,
    )
def _check_origin_exists(origin_url: str) -> None:
    """Make sure the origin url exists, raising an explicit error otherwise.

    Args:
        origin_url: the url of the origin to check

    Raises:
        BadInputExc: the existence check reported the origin as not existing

    """
    existence_info = origin_exists(origin_url)
    if existence_info["exists"]:
        return
    message = f"The provided origin url ({escape(origin_url)}) does not exist!"
    raise BadInputExc(message)
def _get_visit_info_for_save_request( | def _get_visit_info_for_save_request( | ||||
save_request: SaveOriginRequest, | save_request: SaveOriginRequest, | ||||
) -> Tuple[Optional[datetime], Optional[str]]: | ) -> Tuple[Optional[datetime], Optional[str]]: | ||||
"""Retrieve visit information out of a save request | """Retrieve visit information out of a save request | ||||
Args: | Args: | ||||
save_request: Input save origin request to retrieve information for. | save_request: Input save origin request to retrieve information for. | ||||
▲ Show 20 Lines • Show All 142 Lines • ▼ Show 20 Lines | ) -> SaveOriginRequestInfo: | ||||
url are valid but also if the save request can be accepted. | url are valid but also if the save request can be accepted. | ||||
If those checks passed, the loading task is then created. | If those checks passed, the loading task is then created. | ||||
Otherwise, the save request is put in pending or rejected state. | Otherwise, the save request is put in pending or rejected state. | ||||
All the submitted save requests are logged into the swh-web | All the submitted save requests are logged into the swh-web | ||||
database to keep track of them. | database to keep track of them. | ||||
Args: | Args: | ||||
visit_type (str): the type of visit to perform (currently only | visit_type: the type of visit to perform (e.g git, hg, svn, ...) | ||||
``git`` but ``svn`` and ``hg`` will soon be available) | origin_url: the url of the origin to save | ||||
origin_url (str): the url of the origin to save | |||||
Raises: | Raises: | ||||
BadInputExc: the visit type or origin url is invalid | BadInputExc: the visit type or origin url is invalid or inexistent | ||||
ForbiddenExc: the provided origin url is blacklisted | ForbiddenExc: the provided origin url is blacklisted | ||||
Returns: | Returns: | ||||
dict: A dict describing the save request with the following keys: | dict: A dict describing the save request with the following keys: | ||||
* **visit_type**: the type of visit to perform | * **visit_type**: the type of visit to perform | ||||
* **origin_url**: the url of the origin | * **origin_url**: the url of the origin | ||||
* **save_request_date**: the date the request was submitted | * **save_request_date**: the date the request was submitted | ||||
* **save_request_status**: the request status, either **accepted**, | * **save_request_status**: the request status, either **accepted**, | ||||
**rejected** or **pending** | **rejected** or **pending** | ||||
* **save_task_status**: the origin loading task status, either | * **save_task_status**: the origin loading task status, either | ||||
**not created**, **not yet scheduled**, **scheduled**, | **not created**, **not yet scheduled**, **scheduled**, | ||||
**succeed** or **failed** | **succeed** or **failed** | ||||
""" | """ | ||||
_check_visit_type_savable(visit_type) | _check_visit_type_savable(visit_type) | ||||
_check_origin_url_valid(origin_url) | _check_origin_url_valid(origin_url) | ||||
_check_origin_exists(origin_url) | |||||
# if all checks passed so far, we can try and save the origin | |||||
save_request_status = can_save_origin(origin_url) | save_request_status = can_save_origin(origin_url) | ||||
task = None | task = None | ||||
# if the origin save request is accepted, create a scheduler | # if the origin save request is accepted, create a scheduler | ||||
# task to load it into the archive | # task to load it into the archive | ||||
if save_request_status == SAVE_REQUEST_ACCEPTED: | if save_request_status == SAVE_REQUEST_ACCEPTED: | ||||
# create a task with high priority | # create a task with high priority | ||||
kwargs = { | kwargs = { | ||||
▲ Show 20 Lines • Show All 414 Lines • Show Last 20 Lines |
I think you can drop the is True.