diff --git a/docs/uri-scheme-api-origin.rst b/docs/uri-scheme-api-origin.rst
index 0fa7d864..f4202406 100644
--- a/docs/uri-scheme-api-origin.rst
+++ b/docs/uri-scheme-api-origin.rst
@@ -1,10 +1,24 @@
Origin
------

.. autosimple:: swh.web.api.views.origin.api_origin

.. autosimple:: swh.web.api.views.origin.api_origin_search

.. autosimple:: swh.web.api.views.origin.api_origin_visits

.. autosimple:: swh.web.api.views.origin.api_origin_visit
+
+.. autosimple:: swh.web.api.views.origin.api_origin_visit_latest
+
+.. autosimple:: swh.web.save_code_now.api_views.api_save_origin
+
+.. autosimple:: swh.web.save_origin_webhooks.bitbucket.api_origin_save_webhook_bitbucket
+
+.. autosimple:: swh.web.save_origin_webhooks.gitea.api_origin_save_webhook_gitea
+
+.. autosimple:: swh.web.save_origin_webhooks.github.api_origin_save_webhook_github
+
+.. autosimple:: swh.web.save_origin_webhooks.gitlab.api_origin_save_webhook_gitlab
+
+.. autosimple:: swh.web.save_origin_webhooks.sourceforge.api_origin_save_webhook_sourceforge
diff --git a/swh/web/api/apidoc.py b/swh/web/api/apidoc.py
index 25b1f5c9..7f023e97 100644
--- a/swh/web/api/apidoc.py
+++ b/swh/web/api/apidoc.py
@@ -1,487 +1,488 @@
# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

from collections import defaultdict
import functools
from functools import wraps
import os
import re
import textwrap
from typing import List

import docutils.nodes
import docutils.parsers.rst
import docutils.utils

from django.shortcuts import redirect
from rest_framework.decorators import api_view

from swh.web.api.apiresponse import make_api_response
from swh.web.api.apiurls import CategoryId, api_urls
from swh.web.utils import parse_rst, reverse


class _HTTPDomainDocVisitor(docutils.nodes.NodeVisitor):
    """
    docutils visitor for walking on a parsed docutils document containing
    sphinx httpdomain roles. Its purpose is to extract relevant info regarding
    swh api endpoints (for instance url arguments) from their docstring
    written using sphinx httpdomain, and to produce the main description back
    as a ReST string.
    """

    # httpdomain roles we want to parse (based on sphinxcontrib.httpdomain 1.6)
    parameter_roles = ("param", "parameter", "arg", "argument")
    request_json_object_roles = ("reqjsonobj", "reqjson", "<jsonobj", "<json")
    request_json_array_roles = ("reqjsonarr", "<jsonarr")
    response_json_object_roles = ("resjsonobj", "resjson", ">jsonobj", ">json")
    response_json_array_roles = ("resjsonarr", ">jsonarr")
    query_parameter_roles = ("queryparameter", "queryparam", "qparam", "query")
    request_header_roles = ("header", "reqheader", "requestheader")
    response_header_roles = ("resheader", "responseheader")
    status_code_roles = ("statuscode", "status", "code")

    def __init__(self, document, data):
        super().__init__(document)
        self.data = data
        self.args_set = set()
        self.params_set = set()
        self.inputs_set = set()
        self.returns_set = set()
        self.status_codes_set = set()
        self.reqheaders_set = set()
        self.resheaders_set = set()
        self.current_json_obj = None
        self.current_field_name = None

    def _default_visit(self, node: docutils.nodes.Element) -> str:
        """Simply visits a text node, drops its start and end tags, visits
        the children, and concatenates their results."""
        return "".join(map(self.dispatch_visit, node.children))

    def visit_emphasis(self, node: docutils.nodes.emphasis) -> str:
        return f"*{self._default_visit(node)}*"

    def visit_strong(self, node: docutils.nodes.strong) -> str:
        return f"**{self._default_visit(node)}**"

    def visit_reference(self, node: docutils.nodes.reference) -> str:
        text = self._default_visit(node)
        refuri = node.attributes.get("refuri")
        if refuri is not None:
            return f"`{text} <{refuri}>`__"
        else:
            return f"`{text}`_"

    def visit_target(self, node: docutils.nodes.target) -> str:
        parts = ["\n"]
        parts.extend(
            f".. _{name}: {node.attributes['refuri']}"
            for name in node.attributes["names"]
        )
        return "\n".join(parts)

    def visit_literal(self, node: docutils.nodes.literal) -> str:
        return f"``{self._default_visit(node)}``"

    def visit_field_name(self, node: docutils.nodes.field_name) -> str:
        self.current_field_name = node.astext()
        return ""

    def visit_field_body(self, node: docutils.nodes.field_body) -> str:
        text = self._default_visit(node).strip()
        assert text, str(node)
        field_data = self.current_field_name.split(" ")
        # Parameters
        if field_data[0] in self.parameter_roles:
            if field_data[2] not in self.args_set:
                self.data["args"].append(
                    {"name": field_data[2], "type": field_data[1], "doc": text}
                )
                self.args_set.add(field_data[2])
        # Query Parameters
        if field_data[0] in self.query_parameter_roles:
            if field_data[2] not in self.params_set:
                self.data["params"].append(
                    {"name": field_data[2], "type": field_data[1], "doc": text}
                )
                self.params_set.add(field_data[2])
        # Request data type
        if (
            field_data[0] in self.request_json_array_roles
            or field_data[0] in self.request_json_object_roles
        ):
            # array
            if field_data[0] in self.request_json_array_roles:
                self.data["input_type"] = "array"
            # object
            else:
                self.data["input_type"] = "object"
            # input object field
            if field_data[2] not in self.inputs_set:
                self.data["inputs"].append(
                    {"name": field_data[2], "type": field_data[1], "doc": text}
                )
                self.inputs_set.add(field_data[2])
                self.current_json_obj = self.data["inputs"][-1]
        # Response type
        if (
            field_data[0] in self.response_json_array_roles
            or field_data[0] in self.response_json_object_roles
        ):
            # array
            if field_data[0] in self.response_json_array_roles:
                self.data["return_type"] = "array"
            # object
            else:
                self.data["return_type"] = "object"
            # returned object field
            if field_data[2] not in self.returns_set:
                self.data["returns"].append(
                    {"name": field_data[2], "type": field_data[1], "doc": text}
                )
                self.returns_set.add(field_data[2])
                self.current_json_obj = self.data["returns"][-1]
        # Status Codes
        if field_data[0] in self.status_code_roles:
            if field_data[1] not in self.status_codes_set:
                self.data["status_codes"].append({"code": field_data[1], "doc": text})
                self.status_codes_set.add(field_data[1])
        # Request Headers
        if field_data[0] in self.request_header_roles:
            if field_data[1] not in self.reqheaders_set:
                self.data["reqheaders"].append({"name": field_data[1], "doc": text})
                self.reqheaders_set.add(field_data[1])
        # Response Headers
        if field_data[0] in self.response_header_roles:
            if field_data[1] not in self.resheaders_set:
                resheader = {"name": field_data[1], "doc": text}
                self.data["resheaders"].append(resheader)
                self.resheaders_set.add(field_data[1])
                if (
                    resheader["name"] == "Content-Type"
                    and resheader["doc"] == "application/octet-stream"
                ):
                    self.data["return_type"] = "octet stream"
        # Don't return anything in the description; these nodes only add text
        # to other fields
        return ""

    # We ignore these nodes and handle their subtrees directly in
    # visit_field_name and visit_field_body
    visit_field = visit_field_list = _default_visit

    def visit_paragraph(self, node: docutils.nodes.paragraph) -> str:
        """
        Visit relevant paragraphs to parse
        """
        # only parsed top level paragraphs
        text = self._default_visit(node)
        return "\n\n" + text

    def visit_literal_block(self, node: docutils.nodes.literal_block) -> str:
        """
        Visit literal blocks
        """
        text = node.astext()
        return f"\n\n::\n\n{textwrap.indent(text, ' ')}\n"

    def visit_bullet_list(self, node: docutils.nodes.bullet_list) -> str:
        parts = ["\n\n"]
        for child in node.traverse():
            # process list item
            if isinstance(child, docutils.nodes.paragraph):
                line_text = self.dispatch_visit(child)
                parts.append("\t* %s\n" % textwrap.indent(line_text, "\t ").strip())
        return "".join(parts)

    # visit_bullet_list collects and handles this with a more global view:
    visit_list_item = _default_visit

    def visit_warning(self, node: docutils.nodes.warning) -> str:
        text = self._default_visit(node)
        return "\n\n.. warning::\n%s\n" % textwrap.indent(text, "\t")

    def visit_Text(self, node: docutils.nodes.Text) -> str:
        """Leaf node"""
        return str(node).replace("\n", " ")  # Prettier in generated HTML

    def visit_problematic(self, node: docutils.nodes.problematic) -> str:
        # api urls cleanup to generate valid links afterwards
        text = self._default_visit(node)
        subs_made = 1
        while subs_made:
            (text, subs_made) = re.subn(r"(:http:.*)(\(\w+\))", r"\1", text)
        subs_made = 1
        while subs_made:
            (text, subs_made) = re.subn(r"(:http:.*)(\[.*\])", r"\1", text)
        text = re.sub(r"([^:])//", r"\1/", text)
        # transform references to api endpoints doc into valid rst links
        text = re.sub(":http:get:`([^,`]*)`", r"`\1 <\1doc/>`_", text)
+       text = re.sub(":http:post:`([^,`]*)`", r"`\1 <\1doc/>`_", text)
        # transform references to some elements into bold text
        text = re.sub(":http:header:`(.*)`", r"**\1**", text)
        text = re.sub(":func:`(.*)`", r"**\1**", text)
        text = re.sub(":mod:`(.*)`", r"**\1**", text)
        # extract example urls
        if ":swh_web_api:" in text:
            # Extract examples to their own section
            examples_str = re.sub(":swh_web_api:`(.+)`.*", r"/api/1/\1", text)
            self.data["examples"] += examples_str.split("\n")
        return text

    def visit_block_quote(self, node: docutils.nodes.block_quote) -> str:
        return self._default_visit(node)
        return (
            f".. code-block::\n"
            f"{textwrap.indent(self._default_visit(node), ' ')}\n"
        )

    def visit_title_reference(self, node: docutils.nodes.title_reference) -> str:
        text = self._default_visit(node)
        raise Exception(
            f"Unexpected title reference. "
            f"Possible cause: you used `{text}` instead of ``{text}``"
        )

    def visit_document(self, node: docutils.nodes.document) -> None:
        text = self._default_visit(node)
        # Strip examples; they are displayed separately
        text = re.split("\n\\*\\*Examples?:\\*\\*\n", text)[0]
        self.data["description"] = text.strip()

    def visit_system_message(self, node):
        return ""

    def unknown_visit(self, node) -> str:
        raise NotImplementedError(
            f"Unknown node type: {node.__class__.__name__}. Value: {node}"
        )

    def unknown_departure(self, node):
        pass


def _parse_httpdomain_doc(doc, data):
    doc_lines = doc.split("\n")
    doc_lines_filtered = []
    urls = defaultdict(list)
    default_http_methods = ["HEAD", "OPTIONS"]
    # httpdomain is a sphinx extension that is unknown to docutils but
    # fortunately we can still parse its directives' content,
    # so remove lines with httpdomain directives before executing the
    # rst parser from docutils
    for doc_line in doc_lines:
        if ".. http" not in doc_line:
            doc_lines_filtered.append(doc_line)
        else:
            url = doc_line[doc_line.find("/") :]
            # emphasize url arguments for html rendering
            url = re.sub(r"\((\w+)\)", r" **\(\1\)** ", url)
            method = re.search(r"http:(\w+)::", doc_line).group(1)
            urls[url].append(method.upper())
    for url, methods in urls.items():
        data["urls"].append({"rule": url, "methods": methods + default_http_methods})
    # parse the rst docstring and do not print system messages about
    # unknown httpdomain roles
    document = parse_rst("\n".join(doc_lines_filtered), report_level=5)
    # remove the system_message nodes from the parsed document
    for node in document.traverse(docutils.nodes.system_message):
        node.parent.remove(node)
    # visit the document nodes to extract relevant endpoint info
    visitor = _HTTPDomainDocVisitor(document, data)
    document.walkabout(visitor)


class APIDocException(Exception):
    """
    Custom exception to signal errors in the use of the APIDoc decorators
    """


def api_doc(
    route: str,
    *,
    category: CategoryId,
    noargs: bool = False,
    tags: List[str] = [],
    api_version: str = "1",
):
    """
    Decorator for an API endpoint implementation used to generate a dedicated
    view displaying its HTML documentation.

    The documentation will be generated from the endpoint docstring based on
    sphinxcontrib-httpdomain format.

    Args:
        route: documentation page's route
        noargs: set to True if the route has no arguments, and its result
            should be displayed anytime its documentation is requested.
            Defaults to False
        tags: Further information on api endpoints. Three values are
            possibly expected:

            * hidden: remove the entry points from the listing
            * upcoming: display the entry point but it is not followable
            * deprecated: display the entry point as deprecated in the index

        api_version: api version string
    """
    tags_set = set(tags)

    # @api_doc() Decorator call
    def decorator(f):
        # if the route is not hidden, add it to the index
        if "hidden" not in tags_set:
            doc_data = get_doc_data(f, route, noargs)
            doc_desc = doc_data["description"]
            api_urls.add_doc_route(
                route,
                category,
                re.split(r"\.\s", doc_desc)[0],
                noargs=noargs,
                api_version=api_version,
                tags=tags_set,
            )

        # create a dedicated view to display endpoint HTML doc
        @api_view(["GET", "HEAD"])
        @wraps(f)
        def doc_view(request):
            doc_data = get_doc_data(f, route, noargs)
            return make_api_response(request, None, doc_data)

        route_name = "%s-doc" % route[1:-1].replace("/", "-")
        urlpattern = f"^api/{api_version}{route}doc/$"
        view_name = "api-%s-%s" % (api_version, route_name)
        api_urls.add_url_pattern(urlpattern, doc_view, view_name)

        # for backward compatibility as previous apidoc URLs were missing
        # the /api prefix
        old_view_name = view_name.replace("api-", "")
        old_urlpattern = f"^{api_version}{route}doc/$"

        @api_view(["GET", "HEAD"])
        def old_doc_view(request):
            return redirect(reverse(view_name))

        api_urls.add_url_pattern(old_urlpattern, old_doc_view, old_view_name)

        @wraps(f)
        def documented_view(request, **kwargs):
            doc_data = get_doc_data(f, route, noargs)
            try:
                return {"data": f(request, **kwargs), "doc_data": doc_data}
            except Exception as exc:
                exc.doc_data = doc_data
                raise exc

        return documented_view

    return decorator


@functools.lru_cache(maxsize=32)
def get_doc_data(f, route, noargs):
    """
    Build documentation data for the decorated api endpoint function
    """
    data = {
        "description": "",
        "response_data": None,
        "urls": [],
        "args": [],
        "params": [],
        "input_type": "",
        "inputs": [],
        "resheaders": [],
        "reqheaders": [],
        "return_type": "",
        "returns": [],
        "status_codes": [],
        "examples": [],
        "route": route,
        "noargs": noargs,
    }

    if not f.__doc__:
        raise APIDocException(
            "apidoc: expected a docstring" " for function %s" % (f.__name__,)
        )

    # use raw docstring as endpoint documentation if sphinx
    # httpdomain is not used
    if ".. http" not in f.__doc__:
        data["description"] = f.__doc__
    # else parse the sphinx httpdomain docstring with docutils
    # (except when building the swh-web documentation through autodoc
    # sphinx extension, not needed and raise errors with sphinx >= 1.7)
    elif "SWH_DOC_BUILD" not in os.environ:
        _parse_httpdomain_doc(f.__doc__, data)

    # process input/returned object info for nicer html display
    inputs_list = ""
    returns_list = ""
    for inp in data["inputs"]:
        # special case for array of non object type, for instance
        # :<jsonarr string -: an array of string
        if inp["name"] != "-":
            inputs_list += "\t* **%s (%s)**: %s\n" % (
                inp["name"],
                inp["type"],
                textwrap.indent(inp["doc"], "\t "),
            )
    for ret in data["returns"]:
        # special case for array of non object type, for instance
        # :>jsonarr string -: an array of string
        if ret["name"] != "-":
            returns_list += "\t* **%s (%s)**: %s\n" % (
                ret["name"],
                ret["type"],
                textwrap.indent(ret["doc"], "\t "),
            )
    data["inputs_list"] = inputs_list
    data["returns_list"] = returns_list

    return data


DOC_COMMON_HEADERS = """
    :reqheader Accept: the requested response content type,
        either ``application/json`` (default) or ``application/yaml``
    :resheader Content-Type: this depends on :http:header:`Accept`
        header of request"""

DOC_RESHEADER_LINK = """
    :resheader Link: indicates that a subsequent result page is available
        and contains the url pointing to it
"""

DEFAULT_SUBSTITUTIONS = {
    "common_headers": DOC_COMMON_HEADERS,
    "resheader_link": DOC_RESHEADER_LINK,
}


def format_docstring(**substitutions):
    def decorator(f):
        f.__doc__ = f.__doc__.format(**{**DEFAULT_SUBSTITUTIONS, **substitutions})
        return f

    return decorator
"api-1-save-origin", methods=["GET", "POST"], throttle_scope="swh_save_origin", never_cache=True, api_urls=save_code_now_api_urls, ) @api_doc("/origin/save/", category="Request archival") -@format_docstring(visit_types=_savable_visit_types()) +@format_docstring( + visit_types=_savable_visit_types(), webhook_info_doc=_webhook_info_doc() +) def api_save_origin(request: Request, visit_type: str, origin_url: str): """ .. http:get:: /api/1/origin/save/(visit_type)/url/(origin_url)/ .. http:post:: /api/1/origin/save/(visit_type)/url/(origin_url)/ Request the saving of a software origin into the archive or check the status of previously created save requests. That endpoint enables to create a saving task for a software origin through a POST request. Depending of the provided origin url, the save request can either be: * immediately **accepted**, for well known code hosting providers like for instance GitHub or GitLab * **rejected**, in case the url is blacklisted by Software Heritage * **put in pending state** until a manual check is done in order to determine if it can be loaded or not Once a saving request has been accepted, its associated saving task status can then be checked through a GET request on the same url. Returned status can either be: * **not created**: no saving task has been created * **not yet scheduled**: saving task has been created but its execution has not yet been scheduled * **scheduled**: the task execution has been scheduled * **succeeded**: the saving task has been successfully executed * **failed**: the saving task has been executed but it failed When issuing a POST request an object will be returned while a GET request will return an array of objects (as multiple save requests might have been submitted for the same origin). :param string visit_type: the type of visit to perform (currently the supported types are {visit_types}) :param string origin_url: the url of the origin to save {common_headers} :>json string origin_url: the url of the origin to save :>json string visit_type: the type of visit to perform :>json string save_request_date: the date (in iso format) the save request was issued :>json string save_request_status: the status of the save request, either **accepted**, **rejected** or **pending** :>json string save_task_status: the status of the origin saving task, either **not created**, **not yet scheduled**, **scheduled**, **succeeded** or **failed** :>json string visit_date: the date (in iso format) of the visit if a visit occurred, null otherwise. :>json string visit_status: the status of the visit, either **full**, **partial**, **not_found** or **failed** if a visit occurred, null otherwise. 
diff --git a/swh/web/save_code_now/migrations/0013_saveoriginrequest_webhook_info.py b/swh/web/save_code_now/migrations/0013_saveoriginrequest_webhook_info.py
new file mode 100644
index 00000000..70c8d716
--- /dev/null
+++ b/swh/web/save_code_now/migrations/0013_saveoriginrequest_webhook_info.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("swh_web_save_code_now", "0012_saveoriginrequest_note"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="saveoriginrequest",
+            name="from_webhook",
+            field=models.BooleanField(default=False),
+        ),
+        migrations.AddField(
+            model_name="saveoriginrequest",
+            name="webhook_origin",
+            field=models.CharField(max_length=200, null=True),
+        ),
+    ]
""" url = models.CharField(max_length=200, null=False) class Meta: app_label = "swh_web_save_code_now" db_table = "save_unauthorized_origin" indexes = [models.Index(fields=["url"])] def __str__(self): return self.url SAVE_REQUEST_ACCEPTED = "accepted" SAVE_REQUEST_REJECTED = "rejected" SAVE_REQUEST_PENDING = "pending" SAVE_REQUEST_STATUS = [ (SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_ACCEPTED), (SAVE_REQUEST_REJECTED, SAVE_REQUEST_REJECTED), (SAVE_REQUEST_PENDING, SAVE_REQUEST_PENDING), ] SAVE_TASK_NOT_CREATED = "not created" SAVE_TASK_NOT_YET_SCHEDULED = "not yet scheduled" SAVE_TASK_SCHEDULED = "scheduled" SAVE_TASK_SUCCEEDED = "succeeded" SAVE_TASK_FAILED = "failed" SAVE_TASK_RUNNING = "running" SAVE_TASK_STATUS = [ (SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_CREATED), (SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED), (SAVE_TASK_SCHEDULED, SAVE_TASK_SCHEDULED), (SAVE_TASK_SUCCEEDED, SAVE_TASK_SUCCEEDED), (SAVE_TASK_FAILED, SAVE_TASK_FAILED), (SAVE_TASK_RUNNING, SAVE_TASK_RUNNING), ] VISIT_STATUS_CREATED = "created" VISIT_STATUS_ONGOING = "ongoing" VISIT_STATUS_FULL = "full" VISIT_STATUS_PARTIAL = "partial" VISIT_STATUS_NOT_FOUND = "not_found" VISIT_STATUS_FAILED = "failed" VISIT_STATUSES = [ (VISIT_STATUS_CREATED, VISIT_STATUS_CREATED), (VISIT_STATUS_ONGOING, VISIT_STATUS_ONGOING), (VISIT_STATUS_FULL, VISIT_STATUS_FULL), (VISIT_STATUS_PARTIAL, VISIT_STATUS_PARTIAL), (VISIT_STATUS_NOT_FOUND, VISIT_STATUS_NOT_FOUND), (VISIT_STATUS_FAILED, VISIT_STATUS_FAILED), ] class SaveOriginRequest(models.Model): """ Model table holding all the save origin requests issued by users. """ id = models.BigAutoField(primary_key=True) request_date = models.DateTimeField(auto_now_add=True) visit_type = models.CharField(max_length=200, null=False) visit_status = models.TextField(choices=VISIT_STATUSES, null=True) origin_url = models.CharField(max_length=200, null=False) status = models.TextField(choices=SAVE_REQUEST_STATUS, default=SAVE_REQUEST_PENDING) loading_task_id = models.IntegerField(default=-1) visit_date = models.DateTimeField(null=True) loading_task_status = models.TextField( choices=SAVE_TASK_STATUS, default=SAVE_TASK_NOT_CREATED ) # store ids of users that submitted the request as string list user_ids = models.TextField(null=True) note = models.TextField(null=True) + from_webhook = models.BooleanField(default=False) + webhook_origin = models.CharField(max_length=200, null=True) class Meta: app_label = "swh_web_save_code_now" db_table = "save_origin_request" ordering = ["-id"] indexes = [models.Index(fields=["origin_url", "status"])] def to_dict(self) -> SaveOriginRequestInfo: """Map the request save model object to a json serializable dict. Returns: The corresponding SaveOriginRequetsInfo json serializable dict. 
""" visit_date = self.visit_date return SaveOriginRequestInfo( id=self.id, origin_url=self.origin_url, visit_type=self.visit_type, save_request_date=self.request_date.isoformat(), save_request_status=self.status, save_task_status=self.loading_task_status, visit_status=self.visit_status, visit_date=visit_date.isoformat() if visit_date else None, loading_task_id=self.loading_task_id, note=self.note, + from_webhook=self.from_webhook, + webhook_origin=self.webhook_origin, ) def __str__(self) -> str: return str(self.to_dict()) diff --git a/swh/web/save_code_now/origin_save.py b/swh/web/save_code_now/origin_save.py index 4d03b6c7..b47ea644 100644 --- a/swh/web/save_code_now/origin_save.py +++ b/swh/web/save_code_now/origin_save.py @@ -1,839 +1,849 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from functools import lru_cache import json import logging from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse import requests from django.core.exceptions import ObjectDoesNotExist, ValidationError from django.core.validators import URLValidator from django.db.models import Q, QuerySet from django.utils.html import escape from swh.scheduler.utils import create_oneshot_task_dict from swh.web.config import get_config, scheduler from swh.web.save_code_now.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.utils import archive, parse_iso8601_date_to_utc from swh.web.utils.exc import ( BadInputExc, ForbiddenExc, NotFoundExc, sentry_capture_exception, ) from swh.web.utils.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo logger = logging.getLogger(__name__) # Number of days in the past to lookup for information MAX_THRESHOLD_DAYS = 30 # Non terminal visit statuses which needs updates NON_TERMINAL_STATUSES = [ VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, ] def get_origin_save_authorized_urls() -> List[str]: """ Get the list of origin url prefixes authorized to be immediately loaded into the archive (whitelist). Returns: list: The list of authorized origin url prefix """ return [origin.url for origin in SaveAuthorizedOrigin.objects.all()] def get_origin_save_unauthorized_urls() -> List[str]: """ Get the list of origin url prefixes forbidden to be loaded into the archive (blacklist). Returns: list: the list of unauthorized origin url prefix """ return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()] def can_save_origin(origin_url: str, bypass_pending_review: bool = False) -> str: """ Check if a software origin can be saved into the archive. 
diff --git a/swh/web/save_code_now/origin_save.py b/swh/web/save_code_now/origin_save.py
index 4d03b6c7..b47ea644 100644
--- a/swh/web/save_code_now/origin_save.py
+++ b/swh/web/save_code_now/origin_save.py
@@ -1,839 +1,849 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timedelta, timezone
from functools import lru_cache
import json
import logging
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse

import requests

from django.core.exceptions import ObjectDoesNotExist, ValidationError
from django.core.validators import URLValidator
from django.db.models import Q, QuerySet
from django.utils.html import escape

from swh.scheduler.utils import create_oneshot_task_dict
from swh.web.config import get_config, scheduler
from swh.web.save_code_now.models import (
    SAVE_REQUEST_ACCEPTED,
    SAVE_REQUEST_PENDING,
    SAVE_REQUEST_REJECTED,
    SAVE_TASK_FAILED,
    SAVE_TASK_NOT_YET_SCHEDULED,
    SAVE_TASK_RUNNING,
    SAVE_TASK_SCHEDULED,
    SAVE_TASK_SUCCEEDED,
    VISIT_STATUS_CREATED,
    VISIT_STATUS_ONGOING,
    SaveAuthorizedOrigin,
    SaveOriginRequest,
    SaveUnauthorizedOrigin,
)
from swh.web.utils import archive, parse_iso8601_date_to_utc
from swh.web.utils.exc import (
    BadInputExc,
    ForbiddenExc,
    NotFoundExc,
    sentry_capture_exception,
)
from swh.web.utils.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo

logger = logging.getLogger(__name__)

# Number of days in the past to lookup for information
MAX_THRESHOLD_DAYS = 30

# Non terminal visit statuses which need updates
NON_TERMINAL_STATUSES = [
    VISIT_STATUS_CREATED,
    VISIT_STATUS_ONGOING,
]


def get_origin_save_authorized_urls() -> List[str]:
    """
    Get the list of origin url prefixes authorized to be
    immediately loaded into the archive (whitelist).

    Returns:
        list: The list of authorized origin url prefix
    """
    return [origin.url for origin in SaveAuthorizedOrigin.objects.all()]


def get_origin_save_unauthorized_urls() -> List[str]:
    """
    Get the list of origin url prefixes forbidden to be loaded
    into the archive (blacklist).

    Returns:
        list: the list of unauthorized origin url prefix
    """
    return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()]


def can_save_origin(origin_url: str, bypass_pending_review: bool = False) -> str:
    """
    Check if a software origin can be saved into the archive.

    Based on the origin url, the save request will be either:

    * immediately accepted if the url is whitelisted
    * rejected if the url is blacklisted
    * put in pending state for manual review otherwise

    Args:
        origin_url (str): the software origin url to check

    Returns:
        str: the origin save request status, either **accepted**,
        **rejected** or **pending**
    """
    # origin url may be blacklisted
    for url_prefix in get_origin_save_unauthorized_urls():
        if origin_url.startswith(url_prefix):
            return SAVE_REQUEST_REJECTED

    # if the origin url is in the white list, it can be immediately saved
    for url_prefix in get_origin_save_authorized_urls():
        if origin_url.startswith(url_prefix):
            return SAVE_REQUEST_ACCEPTED

    # otherwise, the origin url needs to be manually verified if the user
    # that submitted it does not have special permission
    if bypass_pending_review:
        # mark the origin URL as trusted in that case
        SaveAuthorizedOrigin.objects.get_or_create(url=origin_url)
        return SAVE_REQUEST_ACCEPTED
    else:
        return SAVE_REQUEST_PENDING


# map visit type to scheduler task
# TODO: do not hardcode the task name here (T1157)
_visit_type_task = {
    "git": "load-git",
    "hg": "load-hg",
    "svn": "load-svn",
    "cvs": "load-cvs",
    "bzr": "load-bzr",
}

_visit_type_task_privileged = {
    "archives": "load-archive-files",
}

# map scheduler task status to origin save status
_save_task_status = {
    "next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED,
    "next_run_scheduled": SAVE_TASK_SCHEDULED,
    "completed": SAVE_TASK_SUCCEEDED,
    "disabled": SAVE_TASK_FAILED,
}

# map scheduler task_run status to origin save status
_save_task_run_status = {
    "scheduled": SAVE_TASK_SCHEDULED,
    "started": SAVE_TASK_RUNNING,
    "eventful": SAVE_TASK_SUCCEEDED,
    "uneventful": SAVE_TASK_SUCCEEDED,
    "failed": SAVE_TASK_FAILED,
    "permfailed": SAVE_TASK_FAILED,
    "lost": SAVE_TASK_FAILED,
}


@lru_cache()
def get_scheduler_load_task_types() -> List[str]:
    task_types = scheduler().get_task_types()
    return [t["type"] for t in task_types if t["type"].startswith("load")]


def get_savable_visit_types_dict(privileged_user: bool = False) -> Dict:
    """Return the supported task types the user has access to.

    Args:
        privileged_user: Flag to determine if all visit types should be
            returned or not. Defaults to False to only list unprivileged
            visit types.

    Returns:
        the dict of supported visit types for the user
    """
    if privileged_user:
        task_types = {**_visit_type_task, **_visit_type_task_privileged}
    else:
        task_types = _visit_type_task
    # filter visit types according to scheduler load task types if available
    try:
        load_task_types = get_scheduler_load_task_types()
        return {k: v for k, v in task_types.items() if v in load_task_types}
    except Exception:
        return task_types


def get_savable_visit_types(privileged_user: bool = False) -> List[str]:
    """Return the list of visit types the user can perform save requests on.

    Args:
        privileged_user: Flag to determine if all visit types should be
            returned or not. Defaults to False to only list unprivileged
            visit types.

    Returns:
        the list of savable visit types
    """
    return sorted(list(get_savable_visit_types_dict(privileged_user).keys()))


def _check_visit_type_savable(visit_type: str, privileged_user: bool = False) -> None:
    visit_type_tasks = get_savable_visit_types(privileged_user)
    if visit_type not in visit_type_tasks:
        allowed_visit_types = ", ".join(visit_type_tasks)
        raise BadInputExc(
            f"Visit of type {visit_type} can not be saved! "
            f"Allowed types are the following: {allowed_visit_types}"
        )


_validate_url = URLValidator(
    schemes=["http", "https", "svn", "git", "rsync", "pserver", "ssh", "bzr"]
)


def _check_origin_url_valid(origin_url: str) -> None:
    try:
        _validate_url(origin_url)
    except ValidationError:
        raise BadInputExc(
            f"The provided origin url ({escape(origin_url)}) is not valid!"
        )

    parsed_url = urlparse(origin_url)
    if parsed_url.password not in (None, "", "anonymous"):
        raise BadInputExc(
            "The provided origin url contains a password and cannot be "
            "accepted for security reasons."
        )


def origin_exists(origin_url: str) -> OriginExistenceCheckInfo:
    """Check the origin url for existence. If it exists, extract some more
    useful information on the origin.
    """
    resp = requests.head(origin_url, allow_redirects=True)
    exists = resp.ok
    content_length: Optional[int] = None
    last_modified: Optional[str] = None
    if exists:
        # Also process X-Archive-Orig-* headers in case the URL targets the
        # Internet Archive.
        size_ = resp.headers.get(
            "Content-Length", resp.headers.get("X-Archive-Orig-Content-Length")
        )
        content_length = int(size_) if size_ else None
        try:
            date_str = resp.headers.get(
                "Last-Modified", resp.headers.get("X-Archive-Orig-Last-Modified", "")
            )
            date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z")
            last_modified = date.isoformat()
        except ValueError:
            # if not provided or not parsable as per the expected format,
            # keep it None
            pass

    return OriginExistenceCheckInfo(
        origin_url=origin_url,
        exists=exists,
        last_modified=last_modified,
        content_length=content_length,
    )


def _check_origin_exists(url: str) -> OriginExistenceCheckInfo:
    """Ensure an URL exists, if not raise an explicit message."""
    metadata = origin_exists(url)
    if not metadata["exists"]:
        raise BadInputExc(f"The provided url ({escape(url)}) does not exist!")
    return metadata


def _get_visit_info_for_save_request(
    save_request: SaveOriginRequest,
) -> Tuple[Optional[datetime], Optional[str]]:
    """Retrieve visit information out of a save request.

    Args:
        save_request: Input save origin request to retrieve information for.

    Returns:
        Tuple of (visit date, optional visit status) for such save request
        origin
    """
    visit_date = None
    visit_status = None
    time_now = datetime.now(tz=timezone.utc)
    time_delta = time_now - save_request.request_date

    # stop trying to find a visit date one month after save request submission
    # as those requests to storage are expensive and associated loading task
    # surely ended up with errors
    if time_delta.days <= MAX_THRESHOLD_DAYS:
        origin = save_request.origin_url
        ovs = archive.origin_visit_find_by_date(origin, save_request.request_date)
        if ovs:
            visit_date = parse_iso8601_date_to_utc(ovs["date"])
            visit_status = ovs["status"]

    return visit_date, visit_status


def _check_visit_update_status(
    save_request: SaveOriginRequest,
) -> Tuple[Optional[datetime], Optional[str], Optional[str]]:
    """Given a save request, determine whether the save request was
    successful or failed.

    Args:
        save_request: Input save origin request to retrieve information for.

    Returns:
        Tuple of (optional visit date, optional visit status, optional save
        task status) for such save request origin
    """
    visit_date, visit_status = _get_visit_info_for_save_request(save_request)
    loading_task_status = None
    if visit_date and visit_status in ("full", "partial"):
        # visit has been performed, mark the saving task as succeeded
        loading_task_status = SAVE_TASK_SUCCEEDED
    elif visit_status in ("created", "ongoing"):
        # visit is currently running
        loading_task_status = SAVE_TASK_RUNNING
    elif visit_status in ("not_found", "failed"):
        loading_task_status = SAVE_TASK_FAILED
    else:
        time_now = datetime.now(tz=timezone.utc)
        time_delta = time_now - save_request.request_date
        # consider the task as failed if it is still in scheduled state
        # 30 days after its submission
        if time_delta.days > MAX_THRESHOLD_DAYS:
            loading_task_status = SAVE_TASK_FAILED

    return visit_date, visit_status, loading_task_status


def _compute_task_loading_status(
    task: Optional[Dict[str, Any]] = None,
    task_run: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    loading_task_status: Optional[str] = None
    # First determine the loading task status out of task information
    if task:
        loading_task_status = _save_task_status[task["status"]]
    if task_run:
        loading_task_status = _save_task_run_status[task_run["status"]]

    return loading_task_status


def _update_save_request_info(
    save_request: SaveOriginRequest,
    task: Optional[Dict[str, Any]] = None,
    task_run: Optional[Dict[str, Any]] = None,
) -> SaveOriginRequestInfo:
    """Update save request information out of the visit status and fallback
    to the task and task_run information if the visit status is missing.

    Args:
        save_request: Save request
        task: Associated scheduler task information about the save request
        task_run: Most recent run occurrence of the associated task

    Returns:
        Summary of the save request information updated.
    """
    must_save = False

    # To determine the save code now request's final status, the visit date
    # must be set and the visit status must be a final one. Once they are,
    # the save code now request is definitely done.
    if (
        not save_request.visit_date
        or not save_request.visit_status
        or save_request.visit_status in NON_TERMINAL_STATUSES
    ):
        visit_date, visit_status, loading_task_status = _check_visit_update_status(
            save_request
        )
        if not loading_task_status:  # fallback when not provided
            loading_task_status = _compute_task_loading_status(task, task_run)

        if visit_date != save_request.visit_date:
            must_save = True
            save_request.visit_date = visit_date

        if visit_status != save_request.visit_status:
            must_save = True
            save_request.visit_status = visit_status

        if (
            loading_task_status is not None
            and loading_task_status != save_request.loading_task_status
        ):
            must_save = True
            save_request.loading_task_status = loading_task_status

        if must_save:
            save_request.save()

    return save_request.to_dict()
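To make the visit-status to task-status mapping implemented by `_check_visit_update_status` easier to follow, here is a simplified, self-contained sketch of the decision table (it ignores the visit-date precondition and uses plain strings instead of the module constants):

```python
# Simplified mirror of _check_visit_update_status' mapping (sketch only)
def expected_task_status(visit_status, request_age_days):
    if visit_status in ("full", "partial"):
        return "succeeded"
    if visit_status in ("created", "ongoing"):
        return "running"
    if visit_status in ("not_found", "failed"):
        return "failed"
    # no visit info: give up after MAX_THRESHOLD_DAYS (30 days)
    return "failed" if request_age_days > 30 else None


assert expected_task_status("full", 1) == "succeeded"
assert expected_task_status(None, 45) == "failed"
assert expected_task_status(None, 3) is None  # still undecided
```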
def create_save_origin_request(
    visit_type: str,
    origin_url: str,
    privileged_user: bool = False,
    user_id: Optional[int] = None,
+   from_webhook: bool = False,
+   webhook_origin: Optional[str] = None,
    **kwargs,
) -> SaveOriginRequestInfo:
    """Create a loading task to save a software origin into the archive.

    This function aims to create a software origin loading task through the
    use of the swh-scheduler component. First, some checks are performed to
    see if the visit type and origin url are valid, and also if the save
    request can be accepted. For the 'archives' visit type, this also ensures
    the artifacts actually exist. If those checks pass, the loading task is
    then created. Otherwise, the save request is put in pending or rejected
    state.

    All the submitted save requests are logged into the swh-web database to
    keep track of them.

    Args:
        visit_type: the type of visit to perform (e.g. git, hg, svn,
            archives, ...)
        origin_url: the url of the origin to save
        privileged_user: Whether the user has more privileges than others
            (bypass review, access to privileged other visit types)
        user_id: User identifier (provided when authenticated)
+       from_webhook: Indicates if the save request is created from a
+           webhook receiver
+       webhook_origin: Indicates which forge type sent the webhook
        kwargs: Optional parameters (e.g. artifact_url, artifact_filename,
            artifact_version)

    Raises:
        BadInputExc: the visit type or origin url is invalid or nonexistent
        ForbiddenExc: the provided origin url is blacklisted

    Returns:
        dict: A dict describing the save request with the following keys:

            * **visit_type**: the type of visit to perform
            * **origin_url**: the url of the origin
            * **save_request_date**: the date the request was submitted
            * **save_request_status**: the request status, either
              **accepted**, **rejected** or **pending**
            * **save_task_status**: the origin loading task status, either
              **not created**, **not yet scheduled**, **scheduled**,
              **succeeded** or **failed**
    """
    visit_type_tasks = get_savable_visit_types_dict(privileged_user)
    _check_visit_type_savable(visit_type, privileged_user)
    _check_origin_url_valid(origin_url)

    # if all checks passed so far, we can try and save the origin
    save_request_status = can_save_origin(origin_url, privileged_user)
    task = None

    # if the origin save request is accepted, create a scheduler
    # task to load it into the archive
    if save_request_status == SAVE_REQUEST_ACCEPTED:
        # create a task with high priority
        task_kwargs: Dict[str, Any] = {
            "priority": "high",
            "url": origin_url,
        }
        if visit_type == "archives":
            # extra arguments for that type are required
            archives_data = kwargs.get("archives_data", [])
            if not archives_data:
                raise BadInputExc(
                    "Artifacts data are missing for the archives visit type."
                )
            artifacts = []
            for artifact in archives_data:
                artifact_url = artifact.get("artifact_url")
                artifact_version = artifact.get("artifact_version")
                if not artifact_url or not artifact_version:
                    raise BadInputExc(
                        "Missing url or version for an artifact to load."
                    )
                metadata = _check_origin_exists(artifact_url)
                artifacts.append(
                    {
                        "url": artifact_url,
                        "version": artifact_version,
                        "time": metadata["last_modified"],
                        "length": metadata["content_length"],
                    }
                )
            task_kwargs = dict(
                **task_kwargs, artifacts=artifacts, snapshot_append=True
            )

        sor = None
        # get list of previously submitted save requests (most recent first)
        current_sors = list(
            SaveOriginRequest.objects.filter(
                visit_type=visit_type, origin_url=origin_url
            ).order_by("-request_date")
        )

        can_create_task = False
        # if no save requests previously submitted, create the scheduler task
        if not current_sors:
            can_create_task = True
        else:
            # get the latest submitted save request
            sor = current_sors[0]
            # if it was in pending state, we need to create the scheduler task
            # and update the save request info in the database
            if sor.status == SAVE_REQUEST_PENDING:
                can_create_task = True
            # a task has already been created to load the origin
            elif sor.loading_task_id != -1:
                # get the scheduler task and its status
                tasks = scheduler().get_tasks([sor.loading_task_id])
                task = tasks[0] if tasks else None
                task_runs = scheduler().get_task_runs([sor.loading_task_id])
                task_run = task_runs[0] if task_runs else None
                save_request_info = _update_save_request_info(sor, task, task_run)
                task_status = save_request_info["save_task_status"]
                # create a new scheduler task only if the previous one has
                # already been executed or is currently running
                if task_status in (
                    SAVE_TASK_FAILED,
                    SAVE_TASK_SUCCEEDED,
                    SAVE_TASK_RUNNING,
                ):
                    can_create_task = True
                    sor = None
                else:
                    can_create_task = False

        if can_create_task:
            # effectively create the scheduler task
            task_dict = create_oneshot_task_dict(
                visit_type_tasks[visit_type], **task_kwargs
            )
            task = scheduler().create_tasks([task_dict])[0]

            # pending save request has been accepted
            if sor:
                sor.status = SAVE_REQUEST_ACCEPTED
                sor.loading_task_id = task["id"]
                sor.save()
            else:
                sor = SaveOriginRequest.objects.create(
                    visit_type=visit_type,
                    origin_url=origin_url,
                    status=save_request_status,
                    loading_task_id=task["id"],
                    user_ids=f'"{user_id}"' if user_id else None,
+                   from_webhook=from_webhook,
+                   webhook_origin=webhook_origin,
                )

    # save request must be manually reviewed for acceptance
    elif save_request_status == SAVE_REQUEST_PENDING:
        # check if such a save request has already been submitted,
        # no need to add it to the database in that case
        try:
            sor = SaveOriginRequest.objects.get(
                visit_type=visit_type,
                origin_url=origin_url,
                status=save_request_status,
            )
            user_ids = sor.user_ids if sor.user_ids is not None else ""
            if user_id is not None and f'"{user_id}"' not in user_ids:
                # update user ids list
                sor.user_ids = f'{sor.user_ids},"{user_id}"'
                sor.save()
        # if not, add it to the database
        except ObjectDoesNotExist:
            sor = SaveOriginRequest.objects.create(
                visit_type=visit_type,
                origin_url=origin_url,
                status=save_request_status,
                user_ids=f'"{user_id}"' if user_id else None,
+               from_webhook=from_webhook,
+               webhook_origin=webhook_origin,
            )

    # origin can not be saved as its url is blacklisted,
    # log the request to the database anyway
    else:
        sor = SaveOriginRequest.objects.create(
            visit_type=visit_type,
            origin_url=origin_url,
            status=save_request_status,
            user_ids=f'"{user_id}"' if user_id else None,
+           from_webhook=from_webhook,
+           webhook_origin=webhook_origin,
        )

    if save_request_status == SAVE_REQUEST_REJECTED:
        raise ForbiddenExc(
            (
                'The "save code now" request has been rejected '
                "because the provided origin url is blacklisted."
            )
        )

    assert sor is not None
    return _update_save_request_info(sor, task)


def update_save_origin_requests_from_queryset(
    requests_queryset: QuerySet,
) -> List[SaveOriginRequestInfo]:
    """Update all save requests from a SaveOriginRequest queryset, update
    their status in db and return the list of impacted save_requests.

    Args:
        requests_queryset: input SaveOriginRequest queryset

    Returns:
        list: A list of save origin request info dicts as described in
        :func:`swh.web.save_code_now.origin_save.create_save_origin_request`
    """
    task_ids = []
    for sor in requests_queryset:
        task_ids.append(sor.loading_task_id)
    save_requests = []
    if task_ids:
        try:
            tasks = scheduler().get_tasks(task_ids)
            tasks = {task["id"]: task for task in tasks}
            task_runs = scheduler().get_task_runs(tasks)
            task_runs = {task_run["task"]: task_run for task_run in task_runs}
        except Exception:
            # allow to avoid mocking api GET responses for /origin/save
            # endpoint when running cypress tests as scheduler is not
            # available
            tasks = {}
            task_runs = {}
        for sor in requests_queryset:
            sr_dict = _update_save_request_info(
                sor,
                tasks.get(sor.loading_task_id),
                task_runs.get(sor.loading_task_id),
            )
            save_requests.append(sr_dict)
    return save_requests


def refresh_save_origin_request_statuses() -> List[SaveOriginRequestInfo]:
    """Refresh non-terminal save origin requests (SOR) in the backend.

    Non-terminal SOR are requests whose status is **accepted** and whose task
    status is either **created**, **not yet scheduled**, **scheduled** or
    **running**.

    This computes the list of such SOR, checks their status in the scheduler
    (and optionally in elasticsearch), then updates them in db. Finally, it
    returns the refreshed information on those SOR.
    """
    pivot_date = datetime.now(tz=timezone.utc) - timedelta(days=MAX_THRESHOLD_DAYS)
    save_requests = SaveOriginRequest.objects.filter(
        # Retrieve accepted request statuses (all statuses)
        Q(status=SAVE_REQUEST_ACCEPTED),
        # those without the required information we need to update
        Q(visit_date__isnull=True)
        | Q(visit_status__isnull=True)
        | Q(visit_status__in=NON_TERMINAL_STATUSES),
        # limit results to recent ones (that is roughly 30 days old at best)
        Q(request_date__gte=pivot_date),
    )

    return (
        update_save_origin_requests_from_queryset(save_requests)
        if save_requests.count() > 0
        else []
    )


def get_save_origin_requests(
    visit_type: str, origin_url: str
) -> List[SaveOriginRequestInfo]:
    """
    Get all save requests for a given software origin.

    Args:
        visit_type: the type of visit
        origin_url: the url of the origin

    Raises:
        BadInputExc: the visit type or origin url is invalid
        swh.web.utils.exc.NotFoundExc: no save requests can be found for the
            given origin

    Returns:
        list: A list of save origin requests dict as described in
        :func:`swh.web.save_code_now.origin_save.create_save_origin_request`
    """
    _check_visit_type_savable(visit_type)
    _check_origin_url_valid(origin_url)
    sors = SaveOriginRequest.objects.filter(
        visit_type=visit_type, origin_url=origin_url
    )
    if sors.count() == 0:
        raise NotFoundExc(
            f"No save requests found for visit of type {visit_type} "
            f"on origin with url {origin_url}."
        )
    return update_save_origin_requests_from_queryset(sors)


def get_save_origin_task_info(
    save_request_id: int, full_info: bool = True
) -> Dict[str, Any]:
    """
    Get detailed information about an accepted save origin request and its
    associated loading task.

    If the associated loading task info is archived and removed from the
    scheduler database, returns an empty dictionary.

    Args:
        save_request_id: identifier of a save origin request
        full_info: whether to return detailed info for staff users

    Returns:
        A dictionary with the following keys:

            - **type**: loading task type
            - **arguments**: loading task arguments
            - **id**: loading task database identifier
            - **backend_id**: loading task celery identifier
            - **scheduled**: loading task scheduling date
            - **ended**: loading task termination date
            - **status**: loading task execution status
            - **visit_status**: Actual visit status

        Depending on the availability of the task logs in the elasticsearch
        cluster of Software Heritage, the returned dictionary may also
        contain the following keys:

            - **name**: associated celery task name
            - **message**: relevant log message from task execution
            - **duration**: task execution time (only if it succeeded)
            - **worker**: name of the worker that executed the task
    """
    try:
        save_request = SaveOriginRequest.objects.get(id=save_request_id)
    except ObjectDoesNotExist:
        return {}

    task_info: Dict[str, Any] = {}
    if save_request.note is not None:
        task_info["note"] = save_request.note

    try:
        task = scheduler().get_tasks([save_request.loading_task_id])
    except Exception:
        # to avoid mocking GET responses of /save/task/info/ endpoint when
        # running cypress tests as scheduler is not available in that case
        task = None

    task = task[0] if task else None
    if task is None:
        return task_info

    task_run = scheduler().get_task_runs([task["id"]])
    task_run = task_run[0] if task_run else None
    if task_run is None:
        return task_info
    task_info.update(task_run)
    task_info["type"] = task["type"]
    task_info["arguments"] = task["arguments"]
    task_info["id"] = task_run["task"]
    del task_info["task"]
    del task_info["metadata"]
    # Enrich the task info with the loading visit status
    task_info["visit_status"] = save_request.visit_status

    es_workers_index_url = get_config()["es_workers_index_url"]
    if not es_workers_index_url:
        return task_info
    es_workers_index_url += "/_search"

    if save_request.visit_date:
        min_ts = save_request.visit_date
        max_ts = min_ts + timedelta(days=7)
    else:
        min_ts = save_request.request_date
        max_ts = min_ts + timedelta(days=MAX_THRESHOLD_DAYS)
    min_ts_unix = int(min_ts.timestamp()) * 1000
    max_ts_unix = int(max_ts.timestamp()) * 1000

    save_task_status = _save_task_status[task["status"]]
    priority = "3" if save_task_status == SAVE_TASK_FAILED else "6"

    query = {
        "bool": {
            "must": [
                {"match_phrase": {"syslog.priority": {"query": priority}}},
                {
                    "match_phrase": {
                        "journald.custom.swh_task_id": {"query": task_run["backend_id"]}
                    }
                },
                {
                    "range": {
                        "@timestamp": {
                            "gte": min_ts_unix,
                            "lte": max_ts_unix,
                            "format": "epoch_millis",
                        }
                    }
                },
            ]
        }
    }

    try:
        response = requests.post(
            es_workers_index_url,
            json={"query": query, "sort": ["@timestamp"]},
            timeout=30,
        )
        results = json.loads(response.text)
        if results["hits"]["total"]["value"] >= 1:
            task_run_info = results["hits"]["hits"][-1]["_source"]
            journald_custom = task_run_info.get("journald", {}).get("custom", {})
            task_info["duration"] = journald_custom.get(
                "swh_logging_args_runtime", "not available"
            )
            task_info["message"] = task_run_info.get("message", "not available")
            task_info["name"] = journald_custom.get("swh_task_name", "not available")
            task_info["worker"] = task_run_info.get("host", {}).get("hostname")
    except Exception as exc:
        logger.warning("Request to Elasticsearch failed\n%s", exc)
        sentry_capture_exception(exc)

    if not full_info:
        for field in ("id", "backend_id", "worker"):
            # remove some staff only fields
            task_info.pop(field, None)
        if "message" in task_run and "Loading failure" in task_run["message"]:
            # hide traceback for non staff users, only display exception
            message_lines = task_info["message"].split("\n")
            message = ""
            for line in message_lines:
                if line.startswith("Traceback"):
                    break
                message += f"{line}\n"
            message += message_lines[-1]
            task_info["message"] = message

    return task_info
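The webhook receivers whose docs are added at the top of this patch end up calling `create_save_origin_request` with the two new parameters. The snippet below is a hypothetical, simplified sketch of that call site; the actual receiver logic lives in `swh.web.save_origin_webhooks.generic_receiver` and first parses the forge payload to obtain the origin url and visit type, which is not shown here:

```python
from swh.web.save_code_now.origin_save import create_save_origin_request


def handle_forge_push_event(origin_url: str, visit_type: str, forge_type: str):
    # forge_type is one of SUPPORTED_FORGE_TYPES, e.g. "github" or "gitlab"
    return create_save_origin_request(
        visit_type=visit_type,
        origin_url=origin_url,
        # mark the request as webhook-created so the API can expose
        # from_webhook/webhook_origin in its responses
        from_webhook=True,
        webhook_origin=forge_type,
    )
```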
is True diff --git a/swh/web/save_code_now/tests/test_origin_save.py b/swh/web/save_code_now/tests/test_origin_save.py index d0e60d8f..207e3752 100644 --- a/swh/web/save_code_now/tests/test_origin_save.py +++ b/swh/web/save_code_now/tests/test_origin_save.py @@ -1,789 +1,791 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from functools import partial import re from typing import Optional import uuid import iso8601 import pytest import requests from swh.core.pytest_plugin import get_response_cb from swh.scheduler.utils import create_oneshot_task_dict from swh.web.config import get_config from swh.web.save_code_now.models import ( SAVE_REQUEST_ACCEPTED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_CREATED, VISIT_STATUS_FULL, VISIT_STATUS_ONGOING, VISIT_STATUS_PARTIAL, SaveOriginRequest, ) from swh.web.save_code_now.origin_save import ( _check_origin_exists, _check_visit_type_savable, _visit_type_task, _visit_type_task_privileged, get_savable_visit_types, get_save_origin_requests, get_save_origin_task_info, origin_exists, refresh_save_origin_request_statuses, ) from swh.web.utils.exc import BadInputExc from swh.web.utils.typing import ( OriginExistenceCheckInfo, OriginVisitInfo, SaveOriginRequestInfo, ) _es_url = "http://esnode1.internal.softwareheritage.org:9200" _es_workers_index_url = "%s/swh_workers-*" % _es_url _origin_url = "https://gitlab.com/inkscape/inkscape" _visit_type = "git" _task_id = 1 @pytest.fixture(autouse=True) def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with post method""" cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.post(re.compile("https?://"), body=cb) return requests_mock_datadir @pytest.mark.django_db def test_get_save_origin_archived_task_info(swh_scheduler): _get_save_origin_task_info_test(swh_scheduler, task_archived=True) @pytest.mark.django_db def test_get_save_origin_task_info_without_es(swh_scheduler): _get_save_origin_task_info_test(swh_scheduler, es_available=False) def _fill_scheduler_db( swh_scheduler, task_status="completed", task_run_status="eventful", task_archived=False, visit_started_date=None, ): task = task_run = None if not task_archived: task = swh_scheduler.create_tasks( [create_oneshot_task_dict("load-git", repo_url=_origin_url)] )[0] backend_id = str(uuid.uuid4()) if task_status != "next_run_not_scheduled": swh_scheduler.schedule_task_run(task["id"], backend_id) if task_run_status is not None: swh_scheduler.start_task_run(backend_id) task_run = dict( swh_scheduler.end_task_run(backend_id, task_run_status).items() ) return task, task_run @pytest.mark.parametrize( "wrong_type,privileged_user", [ ("dummy", True), ("dumb", False), ("archives", False), # when no privilege, this is rejected ], ) def test_check_visit_type_savable(wrong_type, privileged_user, swh_scheduler): swh_scheduler.add_load_archive_task_type() with pytest.raises(BadInputExc, match="Allowed types"): _check_visit_type_savable(wrong_type, privileged_user) # when privileged_user, the following is accepted though _check_visit_type_savable("archives", True) def test_get_savable_visit_types(swh_scheduler): swh_scheduler.add_load_archive_task_type() default_list = list(_visit_type_task.keys()) assert 
set(get_savable_visit_types()) == set(default_list) privileged_list = default_list.copy() privileged_list += list(_visit_type_task_privileged.keys()) assert set(get_savable_visit_types(privileged_user=True)) == set(privileged_list) def _get_save_origin_task_info_test( swh_scheduler, task_archived=False, es_available=True, full_info=True ): swh_web_config = get_config() if es_available: swh_web_config.update({"es_workers_index_url": _es_workers_index_url}) else: swh_web_config.update({"es_workers_index_url": ""}) sor = SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, origin_url="https://gitlab.com/inkscape/inkscape", status=SAVE_REQUEST_ACCEPTED, visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1), loading_task_id=_task_id, ) task, task_run = _fill_scheduler_db(swh_scheduler, task_archived=task_archived) es_response = requests.post("%s/_search" % _es_workers_index_url).json() task_exec_data = es_response["hits"]["hits"][-1]["_source"] sor_task_info = get_save_origin_task_info(sor.id, full_info=full_info) expected_result = ( { "type": task["type"], "arguments": task["arguments"], "id": task["id"], "backend_id": task_run["backend_id"], "scheduled": task_run["scheduled"], "started": task_run["started"], "ended": task_run["ended"], "status": task_run["status"], "visit_status": sor.visit_status, } if not task_archived else {} ) if es_available and not task_archived: expected_result.update( { "message": task_exec_data["message"], "name": task_exec_data["swh_task_name"], "worker": task_exec_data["hostname"], } ) if not full_info: expected_result.pop("id", None) expected_result.pop("backend_id", None) expected_result.pop("worker", None) if "message" in expected_result: message = "" message_lines = expected_result["message"].split("\n") for line in message_lines: if line.startswith("Traceback"): break message += f"{line}\n" message += message_lines[-1] expected_result["message"] = message assert sor_task_info == expected_result @pytest.mark.django_db def test_get_save_origin_requests_find_visit_date(mocker, swh_scheduler): # create a save request SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archive _fill_scheduler_db(swh_scheduler) mock_archive = mocker.patch("swh.web.save_code_now.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", status=VISIT_STATUS_FULL, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # check visit date has been correctly found sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] == visit_date mock_archive.origin_visit_find_by_date.assert_called_once() # check visit is not searched again when it has been found get_save_origin_requests(_visit_type, _origin_url) mock_archive.origin_visit_find_by_date.assert_called_once() # check visit date are not searched for save requests older than # one month sor = SaveOriginRequest.objects.create( visit_type=_visit_type, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, loading_task_id=_task_id, 
visit_date=None, ) sor.request_date = datetime.now(tz=timezone.utc) - timedelta(days=31) sor.save() _fill_scheduler_db(swh_scheduler, task_status="disabled", task_run_status="failed") sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 2 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_date"] is None mock_archive.origin_visit_find_by_date.assert_called_once() def _get_save_origin_requests( mocker, swh_scheduler, load_status, visit_status, request_date: Optional[datetime] = None, ): """Wrapper around the get_save_origin_requests call.""" SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=visit_status, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archives _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status=load_status ) mock_archive = mocker.patch("swh.web.save_code_now.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=visit_status, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info sors = get_save_origin_requests(_visit_type, _origin_url) mock_archive.origin_visit_find_by_date.assert_called_once() return sors @pytest.mark.parametrize("visit_date", [None, "some-date"]) def test_from_save_origin_request_to_save_request_info_dict(visit_date): """Ensure the conversion of a save request to a JSON serializable dict is correct""" request_date = datetime.now(tz=timezone.utc) _visit_date = request_date + timedelta(minutes=5) if visit_date else None request_date = datetime.now(tz=timezone.utc) note = "request succeeded" sor = SaveOriginRequest( request_date=request_date, visit_type=_visit_type, visit_status=VISIT_STATUS_FULL, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, loading_task_status=None, visit_date=_visit_date, loading_task_id=1, note=note, ) assert sor.to_dict() == SaveOriginRequestInfo( id=sor.id, origin_url=sor.origin_url, visit_type=sor.visit_type, save_request_date=sor.request_date.isoformat(), save_request_status=sor.status, save_task_status=sor.loading_task_status, visit_status=sor.visit_status, visit_date=_visit_date.isoformat() if _visit_date else None, loading_task_id=sor.loading_task_id, note=note, + from_webhook=False, + webhook_origin=None, ) def test__check_origin_exists_404(requests_mock): url_ko = "https://example.org/some-inexistant-url" requests_mock.head(url_ko, status_code=404) with pytest.raises(BadInputExc, match="not exist"): _check_origin_exists(url_ko) def test__check_origin_exists_200(requests_mock): url = "https://example.org/url" requests_mock.head(url, status_code=200) # passes the check actual_metadata = _check_origin_exists(url) # and we actually may have retrieved some metadata on the origin assert actual_metadata == origin_exists(url) def test_origin_exists_404(requests_mock): """Origin which does not exist should be reported as inexistent""" url_ko = "https://example.org/some-inexistant-url" requests_mock.head(url_ko, status_code=404) actual_result = origin_exists(url_ko) assert actual_result == OriginExistenceCheckInfo( origin_url=url_ko, exists=False, last_modified=None, content_length=None, ) def
test_origin_exists_200_no_data(requests_mock): """Existing origin should be reported as such (no extra information)""" url = "http://example.org/real-url" requests_mock.head( url, status_code=200, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, last_modified=None, content_length=None, ) def test_origin_exists_200_with_data(requests_mock): """Existing origin should be reported as such (+ extra information)""" url = "http://example.org/real-url" requests_mock.head( url, status_code=200, headers={ "content-length": "10", "last-modified": "Sun, 21 Aug 2011 16:26:32 GMT", }, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, content_length=10, last_modified="2011-08-21T16:26:32", ) def test_origin_exists_internet_archive(requests_mock): """Edge case where an artifact URL to check existence is hosted on the Internet Archive""" url = ( "https://web.archive.org/web/20100705043309/" "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz" ) redirect_url = ( "https://web.archive.org/web/20100610004108/" "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz" ) requests_mock.head( url, status_code=302, headers={ "Location": redirect_url, }, ) requests_mock.head( redirect_url, status_code=200, headers={ "X-Archive-Orig-Last-Modified": "Tue, 12 May 2009 22:09:43 GMT", "X-Archive-Orig-Content-Length": "121421", }, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, content_length=121421, last_modified="2009-05-12T22:09:43", ) def test_origin_exists_200_with_data_unexpected_date_format(requests_mock): """Existing origin should be ok, an unexpected last modified time results in no time""" url = "http://example.org/real-url2" # this is parsable but not as expected unexpected_format_date = "Sun, 21 Aug 2021 16:26:32" requests_mock.head( url, status_code=200, headers={ "last-modified": unexpected_format_date, }, ) actual_result = origin_exists(url) # so the resulting date is None assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, content_length=None, last_modified=None, ) @pytest.mark.django_db @pytest.mark.parametrize( "visit_status", [ VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, ], ) def test_get_save_origin_requests_no_visit_date_found( mocker, swh_scheduler, visit_status ): """Scheduled visits with a non-final status (created, ongoing) are reported as still running""" sors = _get_save_origin_requests( mocker, swh_scheduler, load_status="scheduled", visit_status=visit_status, ) # a visit date has been found but the task is still reported as running assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_RUNNING assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] == visit_status @pytest.mark.django_db @pytest.mark.parametrize( "visit_status", [ "not_found", "failed", ], ) def test_get_save_origin_requests_no_failed_status_override( mocker, swh_scheduler, visit_status ): """Uneventful visits with failed statuses (failed, not found) are marked as failed""" sors = _get_save_origin_requests( mocker, swh_scheduler, load_status="uneventful", visit_status=visit_status ) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED visit_date = sors[0]["visit_date"] assert visit_date is not None sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_status"] == visit_status @pytest.mark.django_db
@pytest.mark.parametrize( "load_status,visit_status", [ ("eventful", VISIT_STATUS_FULL), ("eventful", VISIT_STATUS_PARTIAL), ("uneventful", VISIT_STATUS_PARTIAL), ], ) def test_get_visit_info_for_save_request_succeeded( mocker, swh_scheduler, load_status, visit_status ): """Nominal scenario, below 30 days, returns something""" sors = _get_save_origin_requests( mocker, swh_scheduler, load_status=load_status, visit_status=visit_status ) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] == visit_status sors = get_save_origin_requests(_visit_type, _origin_url) assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_status"] == visit_status @pytest.mark.django_db @pytest.mark.parametrize( "load_status", [ "eventful", "uneventful", ], ) def test_get_visit_info_incomplete_visit_still_successful( mocker, swh_scheduler, load_status ): """Incomplete visit information, yet the task is updated partially""" sors = _get_save_origin_requests( mocker, swh_scheduler, load_status=load_status, visit_status=None, ) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED # The visit date is set but the visit status is still missing assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None # so it is still detected as needing an update by the refresh routine sors = refresh_save_origin_request_statuses() assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None
@pytest.mark.django_db def test_refresh_in_progress_save_request_statuses( mocker, swh_scheduler, api_client, archive_data ): """Refresh pending save origin requests and update them if the status changes""" date_now = datetime.now(tz=timezone.utc) date_pivot = date_now - timedelta(days=30) visit_started_date = date_now - timedelta(minutes=1) # returned visit status SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=VISIT_STATUS_CREATED, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archives _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED, ) mock_archive = mocker.patch("swh.web.save_code_now.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=VISIT_STATUS_CREATED, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # make the scheduler return a running event _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status="started", visit_started_date=visit_started_date, ) # The visit is detected but still running sors = refresh_save_origin_request_statuses() assert ( mock_archive.origin_visit_find_by_date.called and mock_archive.origin_visit_find_by_date.call_count == 1 ) assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # The status is updated assert sor["save_task_status"] == SAVE_TASK_RUNNING # but the visit is not in a final state yet so the request will be checked again assert sor["visit_date"] is not None assert sor["visit_status"] == VISIT_STATUS_CREATED # make the visit status completed and make the scheduler return a completed event _fill_scheduler_db( swh_scheduler, task_status="completed", task_run_status="eventful", visit_started_date=visit_started_date, ) # This time around, the origin returned will have all required information updated # (visit date and visit status in final state) visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL}) mock_archive.origin_visit_find_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 assert ( mock_archive.origin_visit_find_by_date.called and mock_archive.origin_visit_find_by_date.call_count == 1 + 1 ) for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # the save request has been updated with the final task and visit statuses assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_FULL # Once in final state, a sor should not be updated anymore sors = refresh_save_origin_request_statuses() assert len(sors) == 0
@pytest.mark.django_db def test_refresh_save_request_statuses(mocker, swh_scheduler, api_client, archive_data): """Refresh filters save origin requests and updates them if anything changes""" date_now = datetime.now(tz=timezone.utc) date_pivot = date_now - timedelta(days=30) # returned visit status SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=None, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archives _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED, ) mock_archive = mocker.patch("swh.web.save_code_now.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=VISIT_STATUS_CREATED, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # no changes so refresh does detect the entry but does nothing sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # as it turns out, in this test, this won't update anything as no new status got # returned by the scheduler assert sor["save_task_status"] == SAVE_TASK_RUNNING # visit information is present but not final assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_CREATED # A save code now entry is detected for update, but as nothing changes, the entry # remains in the same state sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # Status is not updated as no new information is available on the visit status # and the task status has not moved assert sor["save_task_status"] == SAVE_TASK_RUNNING # visit information is present but not final assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_CREATED # This time around, the origin returned will have all information updated # create a visit for the save request with status full visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=VISIT_STATUS_FULL, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # the save request has been updated with the final task and visit statuses assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_FULL # This time, nothing left to update sors = refresh_save_origin_request_statuses() assert len(sors) == 0 diff --git a/swh/web/save_code_now/tests/test_origin_save_api.py b/swh/web/save_code_now/tests/test_origin_save_api.py index 04d0450d..51ddf0be 100644 --- a/swh/web/save_code_now/tests/test_origin_save_api.py +++ b/swh/web/save_code_now/tests/test_origin_save_api.py @@ -1,703 +1,707 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta import uuid import pytest from django.core.exceptions import ObjectDoesNotExist from django.utils import timezone from swh.web.api.throttling import SwhWebUserRateThrottle from swh.web.auth.utils import API_SAVE_ORIGIN_PERMISSION, SWH_AMBASSADOR_PERMISSION from swh.web.save_code_now.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_FAILED, VISIT_STATUS_FULL, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.settings.tests import save_origin_rate_post from swh.web.tests.helpers import ( check_api_get_responses, check_api_post_response, check_api_post_responses, create_django_permission, ) from swh.web.utils import reverse from swh.web.utils.typing import OriginExistenceCheckInfo pytestmark = pytest.mark.django_db @pytest.fixture(autouse=True) def populated_db(): SaveAuthorizedOrigin.objects.create(url="https://github.com/") SaveAuthorizedOrigin.objects.create(url="https://gitlab.com/") SaveUnauthorizedOrigin.objects.create(url="https://github.com/user/illegal_repo") SaveUnauthorizedOrigin.objects.create(url="https://gitlab.com/user_to_exclude") def test_invalid_visit_type(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={ "visit_type": "foo", "origin_url": "https://github.com/torvalds/linux", }, ) check_api_get_responses(api_client, url, status_code=400) def test_invalid_origin_url(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": "bar"} ) check_api_get_responses(api_client, url, status_code=400) def check_created_save_request_status( api_client, mocker, origin_url, expected_request_status, expected_task_status=None, visit_date=None, ): mock_origin_exists = mocker.patch("swh.web.save_code_now.origin_save.origin_exists") mock_origin_exists.return_value = OriginExistenceCheckInfo( origin_url=origin_url, exists=True, last_modified=None,
content_length=None ) url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url} ) mock_visit_date = mocker.patch( ("swh.web.save_code_now.origin_save._get_visit_info_for_save_request") ) mock_visit_date.return_value = (visit_date, None) if expected_request_status != SAVE_REQUEST_REJECTED: response = check_api_post_responses(api_client, url, data=None, status_code=200) assert response.data["save_request_status"] == expected_request_status assert response.data["save_task_status"] == expected_task_status + assert response.data["from_webhook"] is False + assert response.data["webhook_origin"] is None else: check_api_post_responses(api_client, url, data=None, status_code=403) def check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status, expected_task_status, scheduler_task_status="next_run_not_scheduled", scheduler_task_run_status=None, visit_date=None, visit_status=None, ): if expected_task_status != SAVE_TASK_NOT_CREATED: task = dict(swh_scheduler.search_tasks()[0].items()) backend_id = str(uuid.uuid4()) if scheduler_task_status != "next_run_not_scheduled": swh_scheduler.schedule_task_run(task["id"], backend_id) if scheduler_task_run_status is not None: swh_scheduler.start_task_run(backend_id) task_run = dict( swh_scheduler.end_task_run(backend_id, scheduler_task_run_status).items() ) url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url} ) mock_visit_date = mocker.patch( ("swh.web.save_code_now.origin_save._get_visit_info_for_save_request") ) mock_visit_date.return_value = (visit_date, visit_status) response = check_api_get_responses(api_client, url, status_code=200) save_request_data = response.data[0] assert save_request_data["save_request_status"] == expected_request_status assert save_request_data["save_task_status"] == expected_task_status assert save_request_data["visit_status"] == visit_status + assert save_request_data["from_webhook"] is False + assert save_request_data["webhook_origin"] is None if scheduler_task_run_status is not None: # Check that save task status is still available when # the scheduler task has been archived swh_scheduler.delete_archived_tasks( [{"task_id": task["id"], "task_run_id": task_run["id"]}] ) response = check_api_get_responses(api_client, url, status_code=200) save_request_data = response.data[0] assert save_request_data["save_task_status"] == expected_task_status assert save_request_data["visit_status"] == visit_status def test_save_request_rejected(api_client, mocker, swh_scheduler): origin_url = "https://github.com/user/illegal_repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_REJECTED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_REJECTED, expected_task_status=SAVE_TASK_NOT_CREATED, ) def test_save_request_pending(api_client, mocker, swh_scheduler): origin_url = "https://unkwownforge.com/user/repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_PENDING, expected_task_status=SAVE_TASK_NOT_CREATED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_PENDING, expected_task_status=SAVE_TASK_NOT_CREATED, ) def test_save_request_scheduled(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, 
expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, scheduler_task_status="next_run_scheduled", scheduler_task_run_status="scheduled", ) def test_save_request_completed(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SUCCEEDED, scheduler_task_status="completed", scheduler_task_run_status="eventful", visit_date=None, ) def test_save_request_completed_visit_status(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) visit_date = datetime.now(tz=timezone.utc) + timedelta(hours=1) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SUCCEEDED, scheduler_task_status="completed", scheduler_task_run_status="eventful", visit_date=visit_date, visit_status=VISIT_STATUS_FULL, ) def test_save_request_failed(api_client, mocker, swh_scheduler): origin_url = "https://gitlab.com/inkscape/inkscape" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_FAILED, scheduler_task_status="disabled", scheduler_task_run_status="failed", visit_status=VISIT_STATUS_FAILED, ) def test_create_save_request_no_duplicate_if_already_scheduled( api_client, mocker, swh_scheduler ): origin_url = "https://github.com/webpack/webpack" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 1 check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, scheduler_task_status="next_run_scheduled", scheduler_task_run_status="scheduled", ) check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 1 def test_create_save_request_if_previous_one_is_running( api_client, mocker, swh_scheduler ): origin_url = "https://github.com/webpack/webpack" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_RUNNING, scheduler_task_status="next_run_scheduled", 
scheduler_task_run_status="started", ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 1 check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 2 def test_get_save_requests_unknown_origin(api_client, swh_scheduler): unknown_origin_url = "https://gitlab.com/foo/bar" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": unknown_origin_url}, ) response = check_api_get_responses(api_client, url, status_code=404) assert response.data == { "exception": "NotFoundExc", "reason": ( "No save requests found for visit of type git on origin with url %s." ) % unknown_origin_url, } _visit_type = "git" _origin_url = "https://github.com/python/cpython" def test_save_requests_rate_limit(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) for _ in range(save_origin_rate_post): check_api_post_response(api_client, url, status_code=200) check_api_post_response(api_client, url, status_code=429) def test_save_requests_no_rate_limit_if_permission( api_client, regular_user, swh_scheduler ): regular_user.user_permissions.add( create_django_permission(API_SAVE_ORIGIN_PERMISSION) ) assert regular_user.has_perm(API_SAVE_ORIGIN_PERMISSION) api_client.force_login(regular_user) url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) for _ in range(save_origin_rate_post * SwhWebUserRateThrottle.NUM_REQUESTS_FACTOR): check_api_post_response(api_client, url, status_code=200) check_api_post_response(api_client, url, status_code=200) def test_save_request_unknown_repo_with_permission( api_client, regular_user, mocker, swh_scheduler ): regular_user.user_permissions.add( create_django_permission(API_SAVE_ORIGIN_PERMISSION) ) assert regular_user.has_perm(API_SAVE_ORIGIN_PERMISSION) api_client.force_login(regular_user) origin_url = "https://unkwownforge.org/user/repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) def test_save_request_form_server_error(api_client, mocker): create_save_origin_request = mocker.patch( "swh.web.save_code_now.api_views.create_save_origin_request" ) create_save_origin_request.side_effect = Exception("Server error") url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) check_api_post_responses(api_client, url, status_code=500) @pytest.fixture def origin_to_review(): return "https://git.example.org/user/project" def test_create_save_request_pending_review_anonymous_user( api_client, origin_to_review, swh_scheduler ): url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_to_review}, ) response = check_api_post_responses(api_client, url, status_code=200) assert response.data["save_request_status"] == SAVE_REQUEST_PENDING with pytest.raises(ObjectDoesNotExist): SaveAuthorizedOrigin.objects.get(url=origin_to_review) def test_create_save_request_archives_with_ambassador_user( api_client, keycloak_oidc, 
requests_mock, swh_scheduler, ): swh_scheduler.add_load_archive_task_type() keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") originUrl = "https://somewhere.org/simple" artifact_version = "1.2.3" artifact_filename = f"tarball-{artifact_version}.tar.gz" artifact_url = f"{originUrl}/{artifact_filename}" content_length = "100" last_modified = "Sun, 21 Aug 2011 16:26:32 GMT" requests_mock.head( artifact_url, status_code=200, headers={ "content-length": content_length, "last-modified": last_modified, }, ) url = reverse( "api-1-save-origin", url_args={ "visit_type": "archives", "origin_url": originUrl, }, ) response = check_api_post_response( api_client, url, status_code=200, data={ "archives_data": [ { "artifact_url": artifact_url, "artifact_version": artifact_version, } ] }, ) assert response.data["save_request_status"] == SAVE_REQUEST_ACCEPTED assert SaveAuthorizedOrigin.objects.get(url=originUrl) def test_create_save_request_archives_missing_artifacts_data( api_client, keycloak_oidc, swh_scheduler ): swh_scheduler.add_load_archive_task_type() keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") originUrl = "https://somewhere.org/simple" url = reverse( "api-1-save-origin", url_args={ "visit_type": "archives", "origin_url": originUrl, }, ) response = check_api_post_response( api_client, url, status_code=400, data={}, ) assert "Artifacts data are missing" in response.data["reason"] response = check_api_post_response( api_client, url, status_code=400, data={"archives_data": [{"artifact_url": "", "artifact_version": "1.0"}]}, ) assert "Missing url or version for an artifact to load" in response.data["reason"] def test_create_save_request_archives_accepted_ambassador_user( api_client, origin_to_review, keycloak_oidc, mocker, swh_scheduler ): keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") check_created_save_request_status( api_client, mocker, origin_to_review, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) assert SaveAuthorizedOrigin.objects.get(url=origin_to_review) def test_create_save_request_anonymous_user_no_user_id(api_client, swh_scheduler): origin_url = "https://some.git.hosters/user/repo" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, ) check_api_post_responses(api_client, url, status_code=200) sor = SaveOriginRequest.objects.get(origin_url=origin_url) assert sor.user_ids is None def test_create_save_request_authenticated_user_id( api_client, keycloak_oidc, swh_scheduler ): oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") origin_url = "https://some.git.hosters/user/repo2" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, ) response = check_api_post_response(api_client, url, status_code=200) assert response.wsgi_request.user.id is not None user_id = str(response.wsgi_request.user.id) sor = SaveOriginRequest.objects.get(user_ids=f'"{user_id}"') assert sor.user_ids == f'"{user_id}"' def test_create_pending_save_request_multiple_authenticated_users( api_client, swh_scheduler,
regular_user, regular_user2 ): origin_url = "https://some.git.hosters/user/repo3" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, ) api_client.force_login(regular_user) check_api_post_response(api_client, url, status_code=200) api_client.force_login(regular_user2) check_api_post_response(api_client, url, status_code=200) assert SaveOriginRequest.objects.get(user_ids__contains=f'"{regular_user.id}"') assert SaveOriginRequest.objects.get(user_ids__contains=f'"{regular_user2.id}"') def test_reject_origin_url_with_password(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={ "visit_type": "git", "origin_url": "https://user:password@git.example.org/user/repo", }, ) resp = check_api_post_responses(api_client, url, status_code=400) assert resp.data == { "exception": "BadInputExc", "reason": ( "The provided origin url contains a password and cannot " "be accepted for security reasons." ), } def test_accept_origin_url_with_username_but_without_password( api_client, swh_scheduler ): url = reverse( "api-1-save-origin", url_args={ "visit_type": "git", "origin_url": "https://user@git.example.org/user/repo", }, ) check_api_post_responses(api_client, url, status_code=200) @pytest.mark.parametrize( "origin_url", [ "https://anonymous:anonymous@git.example.org/user/repo", "https://anonymous:@git.example.org/user/repo", ], ) def test_accept_origin_url_with_anonymous_credentials( api_client, swh_scheduler, origin_url ): url = reverse( "api-1-save-origin", url_args={ "visit_type": "git", "origin_url": origin_url, }, ) check_api_post_responses(api_client, url, status_code=200) diff --git a/swh/web/save_origin_webhooks/generic_receiver.py b/swh/web/save_origin_webhooks/generic_receiver.py index fa63f11c..88e8ee70 100644 --- a/swh/web/save_origin_webhooks/generic_receiver.py +++ b/swh/web/save_origin_webhooks/generic_receiver.py @@ -1,129 +1,136 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import abc from typing import Any, Dict, Tuple from rest_framework.request import Request from swh.web.api.apidoc import api_doc from swh.web.api.apiurls import APIUrls, api_route from swh.web.save_code_now.origin_save import create_save_origin_request from swh.web.utils.exc import BadInputExc webhooks_api_urls = APIUrls() +SUPPORTED_FORGE_TYPES = set() + + class OriginSaveWebhookReceiver(abc.ABC): FORGE_TYPE: str WEBHOOK_GUIDE_URL: str REPO_TYPES: str @abc.abstractmethod def is_forge_request(self, request: Request) -> bool: ... def is_ping_event(self, request: Request) -> bool: return False @abc.abstractmethod def is_push_event(self, request: Request) -> bool: ... @abc.abstractmethod def extract_repo_info(self, request: Request) -> Tuple[str, str, bool]: """Extract and return a tuple (repository_url, visit_type, private) from the forge webhook payload.""" ... def __init__(self): self.__doc__ = f""" .. http:post:: /api/1/origin/save/webhook/{self.FORGE_TYPE.lower()}/ Webhook receiver for {self.FORGE_TYPE} to request or update the archival of a repository when new commits are pushed to it. To add such a webhook to one of your {self.REPO_TYPES} repositories hosted on {self.FORGE_TYPE}, please follow `{self.FORGE_TYPE}'s webhooks guide <{self.WEBHOOK_GUIDE_URL}>`_. The expected content type for the webhook payload must be ``application/json``.
:>json string origin_url: the url of the origin to save :>json string visit_type: the type of visit to perform :>json string save_request_date: the date (in iso format) the save request was issued :>json string save_request_status: the status of the save request, either **accepted**, **rejected** or **pending** :statuscode 200: save request for repository has been successfully created from the webhook payload. :statuscode 400: no save request has been created due to invalid POST request or missing data in webhook payload """ self.__name__ = f"api_origin_save_webhook_{self.FORGE_TYPE.lower()}" + SUPPORTED_FORGE_TYPES.add(self.FORGE_TYPE.lower()) api_doc( f"/origin/save/webhook/{self.FORGE_TYPE.lower()}/", category="Request archival", )(self) api_route( f"/origin/save/webhook/{self.FORGE_TYPE.lower()}/", f"api-1-origin-save-webhook-{self.FORGE_TYPE.lower()}", methods=["POST"], api_urls=webhooks_api_urls, )(self) def __call__( self, request: Request, ) -> Dict[str, Any]: if not self.is_forge_request(request): raise BadInputExc( f"POST request was not sent by a {self.FORGE_TYPE} webhook and " "has not been processed." ) if self.is_ping_event(request): return {"message": "pong"} if not self.is_push_event(request): raise BadInputExc( f"Event sent by {self.FORGE_TYPE} webhook is not a push one, request " "has not been processed." ) content_type = request.headers.get("Content-Type") if content_type and not content_type.startswith("application/json"): raise BadInputExc( f"Invalid content type '{content_type}' for the POST request sent by " f"{self.FORGE_TYPE} webhook, it should be 'application/json'." ) repo_url, visit_type, private = self.extract_repo_info(request) if not repo_url: raise BadInputExc( f"Repository URL could not be extracted from {self.FORGE_TYPE} webhook " f"payload." ) if not visit_type: raise BadInputExc( f"Visit type could not be determined for repository {repo_url}." ) if private: raise BadInputExc( f"Repository {repo_url} is private and cannot be cloned without authentication."
) save_request = create_save_origin_request( - visit_type=visit_type, origin_url=repo_url + visit_type=visit_type, + origin_url=repo_url, + from_webhook=True, + webhook_origin=self.FORGE_TYPE.lower(), ) return { "origin_url": save_request["origin_url"], "visit_type": save_request["visit_type"], "save_request_date": save_request["save_request_date"], "save_request_status": save_request["save_request_status"], } diff --git a/swh/web/save_origin_webhooks/tests/utils.py b/swh/web/save_origin_webhooks/tests/utils.py index 570c1af4..7b564e87 100644 --- a/swh/web/save_origin_webhooks/tests/utils.py +++ b/swh/web/save_origin_webhooks/tests/utils.py @@ -1,169 +1,175 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict +from swh.web.save_code_now.models import SaveOriginRequest from swh.web.tests.helpers import check_api_post_responses from swh.web.utils import reverse def django_http_headers(http_headers: Dict[str, Any]): return {f"HTTP_{k.upper().replace('-', '_')}": v for k, v in http_headers.items()} def origin_save_webhook_receiver_test( forge_type: str, http_headers: Dict[str, Any], payload: Dict[str, Any], expected_origin_url: str, expected_visit_type: str, api_client, swh_scheduler, ): url = reverse(f"api-1-origin-save-webhook-{forge_type.lower()}") resp = check_api_post_responses( api_client, url, status_code=200, data=payload, **django_http_headers(http_headers), ) assert resp.data["origin_url"] == expected_origin_url assert resp.data["visit_type"] == expected_visit_type tasks = swh_scheduler.search_tasks(task_type=f"load-{expected_visit_type}") assert tasks task = dict(tasks[0].items()) assert task["arguments"]["kwargs"]["url"] == expected_origin_url + request = SaveOriginRequest.objects.get( + origin_url=expected_origin_url, visit_type=expected_visit_type + ) + assert request.from_webhook + def origin_save_webhook_receiver_invalid_request_test( forge_type: str, http_headers: Dict[str, Any], payload: Dict[str, Any], api_client, ): url = reverse(f"api-1-origin-save-webhook-{forge_type.lower()}") resp = check_api_post_responses( api_client, url, status_code=400, data=payload, **django_http_headers(http_headers), ) assert resp.data == { "exception": "BadInputExc", "reason": ( f"POST request was not sent by a {forge_type} webhook " "and has not been processed." ), } def origin_save_webhook_receiver_invalid_event_test( forge_type: str, http_headers: Dict[str, Any], payload: Dict[str, Any], api_client, ): url = reverse(f"api-1-origin-save-webhook-{forge_type.lower()}") resp = check_api_post_responses( api_client, url, status_code=400, data=payload, **django_http_headers(http_headers), ) assert resp.data == { "exception": "BadInputExc", "reason": ( f"Event sent by {forge_type} webhook is not a push one, request has " "not been processed." 
), } def origin_save_webhook_receiver_invalid_content_type_test( forge_type: str, http_headers: Dict[str, Any], payload: Dict[str, Any], api_client, ): url = reverse(f"api-1-origin-save-webhook-{forge_type.lower()}") bad_content_type = "application/x-www-form-urlencoded" http_headers["Content-Type"] = bad_content_type resp = check_api_post_responses( api_client, url, status_code=400, data=payload, **django_http_headers(http_headers), ) assert resp.data == { "exception": "BadInputExc", "reason": ( f"Invalid content type '{bad_content_type}' for the POST request sent by " f"{forge_type} webhook, it should be 'application/json'." ), } def origin_save_webhook_receiver_no_repo_url_test( forge_type: str, http_headers: Dict[str, Any], payload: Dict[str, Any], api_client, ): url = reverse(f"api-1-origin-save-webhook-{forge_type.lower()}") resp = check_api_post_responses( api_client, url, status_code=400, data=payload, **django_http_headers(http_headers), ) assert resp.data == { "exception": "BadInputExc", "reason": ( f"Repository URL could not be extracted from {forge_type} webhook payload." ), } def origin_save_webhook_receiver_private_repo_test( forge_type: str, http_headers: Dict[str, Any], payload: Dict[str, Any], api_client, expected_origin_url: str, ): url = reverse(f"api-1-origin-save-webhook-{forge_type.lower()}") resp = check_api_post_responses( api_client, url, status_code=400, data=payload, **django_http_headers(http_headers), ) assert resp.data == { "exception": "BadInputExc", "reason": ( f"Repository {expected_origin_url} is private and cannot be cloned " "without authentication." ), } diff --git a/swh/web/utils/typing.py b/swh/web/utils/typing.py index b5d1a4f6..6f7771e4 100644 --- a/swh/web/utils/typing.py +++ b/swh/web/utils/typing.py @@ -1,261 +1,265 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, List, Optional, TypeVar from typing_extensions import TypedDict from swh.core.api.classes import PagedResult as CorePagedResult from swh.model.swhids import ObjectType class OriginInfo(TypedDict): url: str """URL of the origin""" class OriginMetadataInfo(TypedDict): url: str """URL of the origin""" metadata: Dict[str, Any] """Origin metadata associated to the origin""" class OriginVisitInfo(TypedDict): date: str """date of the visit in iso format""" formatted_date: str """formatted date of the visit""" metadata: Dict[str, Any] """metadata associated to the visit""" origin: str """visited origin URL""" snapshot: str """snapshot identifier computed during the visit""" status: str """status of the visit ("ongoing", "full" or "partial") """ type: str """visit type (git, hg, debian, ...)""" url: str """URL to browse the snapshot""" visit: int """visit identifier""" class SnapshotBranchInfo(TypedDict): date: Optional[str] """author date of branch heading revision""" directory: Optional[str] """directory associated to branch heading revision""" message: Optional[str] """message of branch heading revision""" name: str """branch name""" alias: bool """define if the branch is an alias""" target_type: str """branch target type: content, directory or revision""" target: str """branch target id""" url: Optional[str] """optional browse URL (content, directory, ...)
scoped to branch""" class SnapshotReleaseInfo(TypedDict): branch_name: str """branch name associated to release in snapshot""" date: str """release date""" directory: Optional[str] """optional directory associated to the release""" id: str """release identifier""" message: str """release message""" name: str """release name""" alias: bool """define if the branch is an alias""" target: str """release target""" target_type: str """release target_type""" url: Optional[str] """optional browse URL (content, directory, ...) scoped to release""" class SnapshotContext(TypedDict): branch: Optional[str] """optional branch name set when browsing snapshot in that scope""" branch_alias: bool """indicates if the focused branch is an alias""" branches: List[SnapshotBranchInfo] """list of snapshot branches (possibly truncated)""" branches_url: str """snapshot branches list browse URL""" is_empty: bool """indicates if the snapshot is empty""" origin_info: Optional[OriginInfo] """optional origin info associated to the snapshot""" origin_visits_url: Optional[str] """optional origin visits URL""" query_params: Dict[str, Optional[str]] """common query parameters when browsing snapshot content""" release: Optional[str] """optional release name set when browsing snapshot in that scope""" release_alias: bool """indicates if the focused release is an alias""" release_id: Optional[str] """optional release identifier set when browsing snapshot in that scope""" releases: List[SnapshotReleaseInfo] """list of snapshot releases (possibly truncated)""" releases_url: str """snapshot releases list browse URL""" revision_id: Optional[str] """optional revision identifier set when browsing snapshot in that scope""" revision_info: Optional[Dict[str, Any]] """optional revision info set when browsing snapshot in that scope""" root_directory: Optional[str] """optional root directory identifier set when browsing snapshot content""" snapshot_id: str """snapshot identifier""" snapshot_sizes: Dict[str, int] """snapshot sizes grouped by branch target type""" snapshot_swhid: str """snapshot SWHID""" url_args: Dict[str, Any] """common URL arguments when browsing snapshot content""" visit_info: Optional[OriginVisitInfo] """optional origin visit info associated to the snapshot""" browse_url: Optional[str] """optional browse URL associated to the snapshot""" class SWHObjectInfo(TypedDict): object_type: ObjectType object_id: Optional[str] class SWHIDContext(TypedDict, total=False): origin: str anchor: str visit: str path: str lines: str class SWHIDInfo(SWHObjectInfo): swhid: str swhid_url: str context: SWHIDContext swhid_with_context: Optional[str] swhid_with_context_url: Optional[str] class SWHObjectInfoMetadata(TypedDict, total=False): origin_url: Optional[str] visit_date: Optional[str] visit_type: Optional[str] class ContentMetadata(SWHObjectInfo, SWHObjectInfoMetadata): sha1: str sha1_git: str sha256: str blake2s256: str content_url: str mimetype: str encoding: str size: int language: str path: Optional[str] filename: Optional[str] directory: Optional[str] root_directory: Optional[str] revision: Optional[str] release: Optional[str] snapshot: Optional[str] class DirectoryMetadata(SWHObjectInfo, SWHObjectInfoMetadata): directory: Optional[str] nb_files: Optional[int] nb_dirs: Optional[int] sum_file_sizes: Optional[int] root_directory: Optional[str] path: Optional[str] revision: Optional[str] revision_found: Optional[bool] release: Optional[str] snapshot: Optional[str] class ReleaseMetadata(SWHObjectInfo, SWHObjectInfoMetadata): release: 
str author: str author_url: str date: str name: str synthetic: bool target: str target_type: str snapshot: Optional[str] class RevisionMetadata(SWHObjectInfo, SWHObjectInfoMetadata): revision: str author: str author_url: str committer: str committer_url: str date: str committer_date: str directory: str merge: bool metadata: str parents: List[str] synthetic: bool type: str snapshot: Optional[str] TResult = TypeVar("TResult") PagedResult = CorePagedResult[TResult, str] class SaveOriginRequestInfo(TypedDict, total=False): id: int """Unique key""" save_request_date: str """Date of the creation request""" visit_type: str """Type of the visit""" visit_status: Optional[str] """Status of the visit""" origin_url: str """Origin to ingest""" save_request_status: str """Status of the request""" loading_task_id: Optional[int] """Identifier of the loading task in the scheduler if scheduled""" visit_date: Optional[str] """End of the visit if terminated""" save_task_status: str """Status of the scheduled task""" note: Optional[str] """Optional note associated to the request, for instance rejection reason""" + from_webhook: bool + """Indicates if request was created from a webhook receiver""" + webhook_origin: Optional[str] + """Indicates from which forge type a webhook was received""" class OriginExistenceCheckInfo(TypedDict): origin_url: str """Origin to check""" exists: bool """Does the url exist?""" content_length: Optional[int] """content length of the artifact""" last_modified: Optional[str] """Last modification time reported by the server (as iso8601 string)"""
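
For context on how the pieces of this changeset fit together: supporting an additional forge only requires subclassing the OriginSaveWebhookReceiver abstract class from generic_receiver.py. Instantiating the subclass is what performs the registration, since __init__ generates the endpoint documentation, adds the forge to SUPPORTED_FORGE_TYPES, and wires the POST route through api_doc and api_route; the base __call__ then creates the save request with from_webhook=True and the forge name as webhook_origin. The following is a minimal sketch for a hypothetical "Gogs" forge: the X-Gogs-Event header, the payload layout, and the guide URL are illustrative assumptions, not part of this changeset.

from typing import Tuple

from rest_framework.request import Request

from swh.web.save_origin_webhooks.generic_receiver import OriginSaveWebhookReceiver


class GogsOriginSaveWebhookReceiver(OriginSaveWebhookReceiver):
    FORGE_TYPE = "Gogs"
    # hypothetical documentation URL, used only to render the endpoint docstring
    WEBHOOK_GUIDE_URL = "https://gogs.example.org/docs/features/webhook"
    REPO_TYPES = "git"

    def is_forge_request(self, request: Request) -> bool:
        # forges typically identify themselves with a dedicated header;
        # "X-Gogs-Event" is an assumed name for this sketch
        return request.headers.get("X-Gogs-Event") is not None

    def is_push_event(self, request: Request) -> bool:
        return request.headers["X-Gogs-Event"] == "push"

    def extract_repo_info(self, request: Request) -> Tuple[str, str, bool]:
        # return the (repository_url, visit_type, private) tuple that the
        # base class __call__ validates before creating the save request
        repo = request.data.get("repository", {})
        return repo.get("clone_url", ""), "git", repo.get("private", False)


# module-level instantiation registers the receiver and exposes the
# POST route /api/1/origin/save/webhook/gogs/
api_origin_save_webhook_gogs = GogsOriginSaveWebhookReceiver()

Under these assumptions, the shared helpers added in swh/web/save_origin_webhooks/tests/utils.py would exercise such a receiver end to end; in particular origin_save_webhook_receiver_test checks both that a loading task was scheduled for the extracted origin URL and that the stored SaveOriginRequest carries the from_webhook flag.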