diff --git a/swh/web/add_forge_now/migrations/0004_rename_tables.py b/swh/web/add_forge_now/migrations/0004_rename_tables.py index 0c02304f..a020462c 100644 --- a/swh/web/add_forge_now/migrations/0004_rename_tables.py +++ b/swh/web/add_forge_now/migrations/0004_rename_tables.py @@ -1,17 +1,21 @@ # Generated by Django 2.2.27 on 2022-03-29 11:42 from django.db import migrations class Migration(migrations.Migration): dependencies = [ ("swh_web_add_forge_now", "0003_request_submitter_forward_username"), ] operations = [ - migrations.AlterModelTable(name="request", table="add_forge_request",), migrations.AlterModelTable( - name="requesthistory", table="add_forge_request_history", + name="request", + table="add_forge_request", + ), + migrations.AlterModelTable( + name="requesthistory", + table="add_forge_request_history", ), ] diff --git a/swh/web/add_forge_now/models.py b/swh/web/add_forge_now/models.py index 62c5baf3..1e714ada 100644 --- a/swh/web/add_forge_now/models.py +++ b/swh/web/add_forge_now/models.py @@ -1,110 +1,112 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations import enum from typing import List from django.db import models from .apps import APP_LABEL class RequestStatus(enum.Enum): """Request statuses. Values are used in the UI. """ PENDING = "Pending" WAITING_FOR_FEEDBACK = "Waiting for feedback" FEEDBACK_TO_HANDLE = "Feedback to handle" ACCEPTED = "Accepted" SCHEDULED = "Scheduled" FIRST_LISTING_DONE = "First listing done" FIRST_ORIGIN_LOADED = "First origin loaded" REJECTED = "Rejected" SUSPENDED = "Suspended" DENIED = "Denied" @classmethod def choices(cls): return tuple((variant.name, variant.value) for variant in cls) def allowed_next_statuses(self) -> List[RequestStatus]: next_statuses = { self.PENDING: [self.WAITING_FOR_FEEDBACK, self.REJECTED, self.SUSPENDED], self.WAITING_FOR_FEEDBACK: [self.FEEDBACK_TO_HANDLE], self.FEEDBACK_TO_HANDLE: [ self.WAITING_FOR_FEEDBACK, self.ACCEPTED, self.REJECTED, self.SUSPENDED, ], self.ACCEPTED: [self.SCHEDULED], self.SCHEDULED: [ self.FIRST_LISTING_DONE, # in case of race condition between lister and loader: self.FIRST_ORIGIN_LOADED, ], self.FIRST_LISTING_DONE: [self.FIRST_ORIGIN_LOADED], self.FIRST_ORIGIN_LOADED: [], self.REJECTED: [], self.SUSPENDED: [self.PENDING], self.DENIED: [], } return next_statuses[self] # type: ignore class RequestActorRole(enum.Enum): MODERATOR = "moderator" SUBMITTER = "submitter" FORGE_ADMIN = "forge admin" @classmethod def choices(cls): return tuple((variant.name, variant.value) for variant in cls) class RequestHistory(models.Model): """Comment or status change on a request, made by either the submitter or a moderator.
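Each entry records the ``actor`` and their ``actor_role``, an optional ``new_status``, a free-form ``text`` comment and an automatically set ``date``.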
""" request = models.ForeignKey("Request", models.DO_NOTHING) text = models.TextField() actor = models.TextField() actor_role = models.TextField(choices=RequestActorRole.choices()) date = models.DateTimeField(auto_now_add=True) new_status = models.TextField(choices=RequestStatus.choices(), null=True) class Meta: app_label = APP_LABEL db_table = "add_forge_request_history" class Request(models.Model): status = models.TextField( - choices=RequestStatus.choices(), default=RequestStatus.PENDING.name, + choices=RequestStatus.choices(), + default=RequestStatus.PENDING.name, ) submission_date = models.DateTimeField(auto_now_add=True) submitter_name = models.TextField() submitter_email = models.TextField() submitter_forward_username = models.BooleanField(default=False) # FIXME: shall we do create a user model inside the webapp instead? forge_type = models.TextField() forge_url = models.TextField() forge_contact_email = models.EmailField() forge_contact_name = models.TextField() forge_contact_comment = models.TextField( - null=True, help_text="Where did you find this contact information (url, ...)", + null=True, + help_text="Where did you find this contact information (url, ...)", ) class Meta: app_label = APP_LABEL db_table = "add_forge_request" diff --git a/swh/web/add_forge_now/views.py b/swh/web/add_forge_now/views.py index eb18ae6d..0566d070 100644 --- a/swh/web/add_forge_now/views.py +++ b/swh/web/add_forge_now/views.py @@ -1,125 +1,127 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, List from django.conf.urls import url from django.core.paginator import Paginator from django.db.models import Q from django.http.request import HttpRequest from django.http.response import HttpResponse, JsonResponse from django.shortcuts import render from swh.web.add_forge_now.models import Request as AddForgeRequest from swh.web.api.views.add_forge_now import ( AddForgeNowRequestPublicSerializer, AddForgeNowRequestSerializer, ) from swh.web.common.utils import has_add_forge_now_permission def add_forge_request_list_datatables(request: HttpRequest) -> HttpResponse: """Dedicated endpoint used by datatables to display the add-forge requests in the Web UI. 
""" draw = int(request.GET.get("draw", 0)) add_forge_requests = AddForgeRequest.objects.all() table_data: Dict[str, Any] = { "recordsTotal": add_forge_requests.count(), "draw": draw, } search_value = request.GET.get("search[value]") column_order = request.GET.get("order[0][column]") field_order = request.GET.get(f"columns[{column_order}][name]", "id") order_dir = request.GET.get("order[0][dir]", "desc") if field_order: if order_dir == "desc": field_order = "-" + field_order add_forge_requests = add_forge_requests.order_by(field_order) per_page = int(request.GET.get("length", 10)) page_num = int(request.GET.get("start", 0)) // per_page + 1 if search_value: add_forge_requests = add_forge_requests.filter( Q(forge_type__icontains=search_value) | Q(forge_url__icontains=search_value) | Q(status__icontains=search_value) ) if ( int(request.GET.get("user_requests_only", "0")) and request.user.is_authenticated ): add_forge_requests = add_forge_requests.filter( submitter_name=request.user.username ) paginator = Paginator(add_forge_requests, per_page) page = paginator.page(page_num) if has_add_forge_now_permission(request.user): requests = AddForgeNowRequestSerializer(page.object_list, many=True).data else: requests = AddForgeNowRequestPublicSerializer(page.object_list, many=True).data results = [dict(request) for request in requests] table_data["recordsFiltered"] = add_forge_requests.count() table_data["data"] = results return JsonResponse(table_data) FORGE_TYPES: List[str] = [ "bitbucket", "cgit", "gitlab", "gitea", "heptapod", ] def create_request_create(request): - """View to create a new 'add_forge_now' request. - - """ + """View to create a new 'add_forge_now' request.""" return render( - request, "add_forge_now/creation_form.html", {"forge_types": FORGE_TYPES}, + request, + "add_forge_now/creation_form.html", + {"forge_types": FORGE_TYPES}, ) def create_request_list(request): - """View to list existing 'add_forge_now' requests. + """View to list existing 'add_forge_now' requests.""" - """ - - return render(request, "add_forge_now/list.html",) + return render( + request, + "add_forge_now/list.html", + ) def create_request_help(request): - """View to explain 'add_forge_now'. 
- - """ + """View to explain 'add_forge_now'.""" - return render(request, "add_forge_now/help.html",) + return render( + request, + "add_forge_now/help.html", + ) urlpatterns = [ url( r"^add-forge/request/list/datatables/$", add_forge_request_list_datatables, name="add-forge-request-list-datatables", ), url(r"^add-forge/request/create/$", create_request_create, name="forge-add-create"), url(r"^add-forge/request/list/$", create_request_list, name="forge-add-list"), url(r"^add-forge/request/help/$", create_request_help, name="forge-add-help"), ] diff --git a/swh/web/admin/add_forge_now.py b/swh/web/admin/add_forge_now.py index d389cfcb..aa21ca6a 100644 --- a/swh/web/admin/add_forge_now.py +++ b/swh/web/admin/add_forge_now.py @@ -1,42 +1,39 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.conf import settings from django.contrib.auth.decorators import user_passes_test from django.shortcuts import render from swh.web.admin.adminurls import admin_route from swh.web.common.utils import has_add_forge_now_permission @admin_route( - r"add-forge/requests/", view_name="add-forge-now-requests-moderation", + r"add-forge/requests/", + view_name="add-forge-now-requests-moderation", ) @user_passes_test(has_add_forge_now_permission, login_url=settings.LOGIN_URL) def add_forge_now_requests_moderation_dashboard(request): - """Moderation dashboard to allow listing current requests. - - """ + """Moderation dashboard to allow listing current requests.""" return render( request, "add_forge_now/requests-moderation.html", {"heading": "Add forge now requests moderation"}, ) @admin_route( r"add-forge/request/(?P<request_id>(\d)+)/", view_name="add-forge-now-request-dashboard", ) @user_passes_test(has_add_forge_now_permission, login_url=settings.LOGIN_URL) def add_forge_now_request_dashboard(request, request_id): - """Moderation dashboard to allow listing current requests. - - """ + """Moderation dashboard to allow listing current requests.""" return render( request, "add_forge_now/request-dashboard.html", {"request_id": request_id, "heading": "Add forge now request dashboard"}, ) diff --git a/swh/web/api/apidoc.py b/swh/web/api/apidoc.py index 26ed9338..85a4f0a2 100644 --- a/swh/web/api/apidoc.py +++ b/swh/web/api/apidoc.py @@ -1,468 +1,471 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import functools from functools import wraps import os import re import textwrap from typing import List import docutils.nodes import docutils.parsers.rst import docutils.utils from rest_framework.decorators import api_view from swh.web.api.apiresponse import make_api_response from swh.web.api.apiurls import APIUrls from swh.web.common.utils import parse_rst class _HTTPDomainDocVisitor(docutils.nodes.NodeVisitor): """ docutils visitor for walking on a parsed docutils document containing sphinx httpdomain roles. 
Its purpose is to extract relevant info regarding swh api endpoints (for instance url arguments) from their docstring written using sphinx httpdomain; and produce the main description back into a ReST string """ # httpdomain roles we want to parse (based on sphinxcontrib.httpdomain 1.6) parameter_roles = ("param", "parameter", "arg", "argument") request_json_object_roles = ("reqjsonobj", "reqjson", "<jsonobj", "<json") request_json_array_roles = ("reqjsonarr", "<jsonarr") response_json_object_roles = ("resjsonobj", "resjson", ">jsonobj", ">json") response_json_array_roles = ("resjsonarr", ">jsonarr") query_parameter_roles = ("queryparameter", "queryparam", "qparam", "query") request_header_roles = ("<header", "reqheader", "requestheader") response_header_roles = (">header", "resheader", "responseheader") status_code_roles = ("statuscode", "status", "code") def __init__(self, document, data): super().__init__(document) self.data = data self.args_set = set() self.params_set = set() self.inputs_set = set() self.returns_set = set() self.status_codes_set = set() self.reqheaders_set = set() self.resheaders_set = set() self.current_json_obj = None self.current_field_name = None def _default_visit(self, node: docutils.nodes.Element) -> str: """Simply visits a text node, drops its start and end tags, visits the children, and concatenates their results.""" return "".join(map(self.dispatch_visit, node.children)) def visit_emphasis(self, node: docutils.nodes.emphasis) -> str: return f"*{self._default_visit(node)}*" def visit_strong(self, node: docutils.nodes.emphasis) -> str: return f"**{self._default_visit(node)}**" def visit_reference(self, node: docutils.nodes.reference) -> str: text = self._default_visit(node) refuri = node.attributes.get("refuri") if refuri is not None: return f"`{text} <{refuri}>`__" else: return f"`{text}`_" def visit_target(self, node: docutils.nodes.reference) -> str: parts = ["\n"] parts.extend( f".. 
_{name}: {node.attributes['refuri']}" for name in node.attributes["names"] ) return "\n".join(parts) def visit_literal(self, node: docutils.nodes.literal) -> str: return f"``{self._default_visit(node)}``" def visit_field_name(self, node: docutils.nodes.field_name) -> str: self.current_field_name = node.astext() return "" def visit_field_body(self, node: docutils.nodes.field_body) -> str: text = self._default_visit(node).strip() assert text, str(node) field_data = self.current_field_name.split(" ") # Parameters if field_data[0] in self.parameter_roles: if field_data[2] not in self.args_set: self.data["args"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.args_set.add(field_data[2]) # Query Parameters if field_data[0] in self.query_parameter_roles: if field_data[2] not in self.params_set: self.data["params"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.params_set.add(field_data[2]) # Request data type if ( field_data[0] in self.request_json_array_roles or field_data[0] in self.request_json_object_roles ): # array if field_data[0] in self.request_json_array_roles: self.data["input_type"] = "array" # object else: self.data["input_type"] = "object" # input object field if field_data[2] not in self.inputs_set: self.data["inputs"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.inputs_set.add(field_data[2]) self.current_json_obj = self.data["inputs"][-1] # Response type if ( field_data[0] in self.response_json_array_roles or field_data[0] in self.response_json_object_roles ): # array if field_data[0] in self.response_json_array_roles: self.data["return_type"] = "array" # object else: self.data["return_type"] = "object" # returned object field if field_data[2] not in self.returns_set: self.data["returns"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.returns_set.add(field_data[2]) self.current_json_obj = self.data["returns"][-1] # Status Codes if field_data[0] in self.status_code_roles: if field_data[1] not in self.status_codes_set: self.data["status_codes"].append({"code": field_data[1], "doc": text}) self.status_codes_set.add(field_data[1]) # Request Headers if field_data[0] in self.request_header_roles: if field_data[1] not in self.reqheaders_set: self.data["reqheaders"].append({"name": field_data[1], "doc": text}) self.reqheaders_set.add(field_data[1]) # Response Headers if field_data[0] in self.response_header_roles: if field_data[1] not in self.resheaders_set: resheader = {"name": field_data[1], "doc": text} self.data["resheaders"].append(resheader) self.resheaders_set.add(field_data[1]) if ( resheader["name"] == "Content-Type" and resheader["doc"] == "application/octet-stream" ): self.data["return_type"] = "octet stream" # Don't return anything in the description; these nodes only add text # to other fields return "" # We ignore these nodes and handle their subtrees directly in # visit_field_name and visit_field_body visit_field = visit_field_list = _default_visit def visit_paragraph(self, node: docutils.nodes.paragraph) -> str: """ Visit relevant paragraphs to parse """ # only parsed top level paragraphs text = self._default_visit(node) return "\n\n" + text def visit_literal_block(self, node: docutils.nodes.literal_block) -> str: """ Visit literal blocks """ text = node.astext() return f"\n\n::\n\n{textwrap.indent(text, ' ')}\n" def visit_bullet_list(self, node: docutils.nodes.bullet_list) -> str: parts = ["\n\n"] for child in node.traverse(): # process list item if 
isinstance(child, docutils.nodes.paragraph): line_text = self.dispatch_visit(child) parts.append("\t* %s\n" % textwrap.indent(line_text, "\t ").strip()) return "".join(parts) # visit_bullet_list collects and handles this with a more global view: visit_list_item = _default_visit def visit_warning(self, node: docutils.nodes.warning) -> str: text = self._default_visit(node) return "\n\n.. warning::\n%s\n" % textwrap.indent(text, "\t") def visit_Text(self, node: docutils.nodes.Text) -> str: """Leaf node""" return str(node).replace("\n", " ") # Prettier in generated HTML def visit_problematic(self, node: docutils.nodes.problematic) -> str: # api urls cleanup to generate valid links afterwards text = self._default_visit(node) subs_made = 1 while subs_made: (text, subs_made) = re.subn(r"(:http:.*)(\(\w+\))", r"\1", text) subs_made = 1 while subs_made: (text, subs_made) = re.subn(r"(:http:.*)(\[.*\])", r"\1", text) text = re.sub(r"([^:])//", r"\1/", text) # transform references to api endpoints doc into valid rst links text = re.sub(":http:get:`([^,`]*)`", r"`\1 <\1doc/>`_", text) # transform references to some elements into bold text text = re.sub(":http:header:`(.*)`", r"**\1**", text) text = re.sub(":func:`(.*)`", r"**\1**", text) # extract example urls if ":swh_web_api:" in text: # Extract examples to their own section examples_str = re.sub(":swh_web_api:`(.+)`.*", r"/api/1/\1", text) self.data["examples"] += examples_str.split("\n") return text def visit_block_quote(self, node: docutils.nodes.block_quote) -> str: return self._default_visit(node) return ( f".. code-block::\n" f"{textwrap.indent(self._default_visit(node), ' ')}\n" ) def visit_title_reference(self, node: docutils.nodes.title_reference) -> str: text = self._default_visit(node) raise Exception( f"Unexpected title reference. " f"Possible cause: you used `{text}` instead of ``{text}``" ) def visit_document(self, node: docutils.nodes.document) -> None: text = self._default_visit(node) # Strip examples; they are displayed separately text = re.split("\n\\*\\*Examples?:\\*\\*\n", text)[0] self.data["description"] = text.strip() def visit_system_message(self, node): return "" def unknown_visit(self, node) -> str: raise NotImplementedError( f"Unknown node type: {node.__class__.__name__}. Value: {node}" ) def unknown_departure(self, node): pass def _parse_httpdomain_doc(doc, data): doc_lines = doc.split("\n") doc_lines_filtered = [] urls = defaultdict(list) default_http_methods = ["HEAD", "OPTIONS"] # httpdomain is a sphinx extension that is unknown to docutils but # fortunately we can still parse its directives' content, # so remove lines with httpdomain directives before executing the # rst parser from docutils for doc_line in doc_lines: if ".. 
http" not in doc_line: doc_lines_filtered.append(doc_line) else: url = doc_line[doc_line.find("/") :] # emphasize url arguments for html rendering url = re.sub(r"\((\w+)\)", r" **\(\1\)** ", url) method = re.search(r"http:(\w+)::", doc_line).group(1) urls[url].append(method.upper()) for url, methods in urls.items(): data["urls"].append({"rule": url, "methods": methods + default_http_methods}) # parse the rst docstring and do not print system messages about # unknown httpdomain roles document = parse_rst("\n".join(doc_lines_filtered), report_level=5) # remove the system_message nodes from the parsed document for node in document.traverse(docutils.nodes.system_message): node.parent.remove(node) # visit the document nodes to extract relevant endpoint info visitor = _HTTPDomainDocVisitor(document, data) document.walkabout(visitor) class APIDocException(Exception): """ Custom exception to signal errors in the use of the APIDoc decorators """ def api_doc( - route: str, noargs: bool = False, tags: List[str] = [], api_version: str = "1", + route: str, + noargs: bool = False, + tags: List[str] = [], + api_version: str = "1", ): """ Decorator for an API endpoint implementation used to generate a dedicated view displaying its HTML documentation. The documentation will be generated from the endpoint docstring based on sphinxcontrib-httpdomain format. Args: route: documentation page's route noargs: set to True if the route has no arguments, and its result should be displayed anytime its documentation is requested. Default to False tags: Further information on api endpoints. Two values are possibly expected: * hidden: remove the entry points from the listing * upcoming: display the entry point but it is not followable * deprecated: display the entry point as deprecated in the index api_version: api version string """ tags_set = set(tags) # @api_doc() Decorator call def decorator(f): # if the route is not hidden, add it to the index if "hidden" not in tags_set: doc_data = get_doc_data(f, route, noargs) doc_desc = doc_data["description"] APIUrls.add_doc_route( route, re.split(r"\.\s", doc_desc)[0], noargs=noargs, api_version=api_version, tags=tags_set, ) # create a dedicated view to display endpoint HTML doc @api_view(["GET", "HEAD"]) @wraps(f) def doc_view(request): doc_data = get_doc_data(f, route, noargs) return make_api_response(request, None, doc_data) route_name = "%s-doc" % route[1:-1].replace("/", "-") urlpattern = f"^{api_version}{route}doc/$" view_name = "api-%s-%s" % (api_version, route_name) APIUrls.add_url_pattern(urlpattern, doc_view, view_name) @wraps(f) def documented_view(request, **kwargs): doc_data = get_doc_data(f, route, noargs) try: return {"data": f(request, **kwargs), "doc_data": doc_data} except Exception as exc: exc.doc_data = doc_data raise exc return documented_view return decorator @functools.lru_cache(maxsize=32) def get_doc_data(f, route, noargs): """ Build documentation data for the decorated api endpoint function """ data = { "description": "", "response_data": None, "urls": [], "args": [], "params": [], "input_type": "", "inputs": [], "resheaders": [], "reqheaders": [], "return_type": "", "returns": [], "status_codes": [], "examples": [], "route": route, "noargs": noargs, } if not f.__doc__: raise APIDocException( "apidoc: expected a docstring" " for function %s" % (f.__name__,) ) # use raw docstring as endpoint documentation if sphinx # httpdomain is not used if ".. 
http" not in f.__doc__: data["description"] = f.__doc__ # else parse the sphinx httpdomain docstring with docutils # (except when building the swh-web documentation through autodoc # sphinx extension, not needed and raise errors with sphinx >= 1.7) elif "SWH_DOC_BUILD" not in os.environ: _parse_httpdomain_doc(f.__doc__, data) # process input/returned object info for nicer html display inputs_list = "" returns_list = "" for inp in data["inputs"]: # special case for array of non object type, for instance # :<jsonarr string -: an array of string if inp["name"] != "-": inputs_list += "\t* **%s (%s)**: %s\n" % ( inp["name"], inp["type"], textwrap.indent(inp["doc"], "\t "), ) for ret in data["returns"]: # special case for array of non object type, for instance # :>jsonarr string -: an array of string if ret["name"] != "-": returns_list += "\t* **%s (%s)**: %s\n" % ( ret["name"], ret["type"], textwrap.indent(ret["doc"], "\t "), ) data["inputs_list"] = inputs_list data["returns_list"] = returns_list return data DOC_COMMON_HEADERS = """ :reqheader Accept: the requested response content type, either ``application/json`` (default) or ``application/yaml`` :resheader Content-Type: this depends on :http:header:`Accept` header of request""" DOC_RESHEADER_LINK = """ :resheader Link: indicates that a subsequent result page is available and contains the url pointing to it """ DEFAULT_SUBSTITUTIONS = { "common_headers": DOC_COMMON_HEADERS, "resheader_link": DOC_RESHEADER_LINK, } def format_docstring(**substitutions): def decorator(f): f.__doc__ = f.__doc__.format(**{**DEFAULT_SUBSTITUTIONS, **substitutions}) return f return decorator diff --git a/swh/web/api/apiresponse.py b/swh/web/api/apiresponse.py index fa2ccd50..95c89687 100644 --- a/swh/web/api/apiresponse.py +++ b/swh/web/api/apiresponse.py @@ -1,231 +1,230 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import traceback from typing import Any, Dict, Optional from django.http import HttpResponse from django.shortcuts import render from django.utils.cache import add_never_cache_headers from django.utils.html import escape from rest_framework.exceptions import APIException from rest_framework.request import Request from rest_framework.response import Response from rest_framework.utils.encoders import JSONEncoder from swh.storage.exc import StorageAPIError, StorageDBError from swh.web.api import utils from swh.web.common.exc import ( BadInputExc, ForbiddenExc, LargePayloadExc, NotFoundExc, sentry_capture_exception, ) from swh.web.common.utils import gen_path_info, shorten_path from swh.web.config import get_config logger = logging.getLogger("django") def compute_link_header(rv: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]: """Add Link header in returned value results. 
Args: rv (dict): dictionary with keys: - headers: potential headers with 'link-next' and 'link-prev' keys - results: containing the result to return options (dict): the initial dict to update with result if any Returns: dict: dictionary with optional keys 'link-next' and 'link-prev' """ link_headers = [] if "headers" not in rv: return {} rv_headers = rv["headers"] if "link-next" in rv_headers: link_headers.append('<%s>; rel="next"' % rv_headers["link-next"]) if "link-prev" in rv_headers: link_headers.append('<%s>; rel="previous"' % rv_headers["link-prev"]) if link_headers: link_header_str = ",".join(link_headers) headers = options.get("headers", {}) headers.update({"Link": link_header_str}) return headers return {} def filter_by_fields(request: Request, data: Dict[str, Any]) -> Dict[str, Any]: """Extract a request parameter 'fields' if it exists to permit filtering on the data dict's keys. If no such field is provided, returns the data as is. """ fields = request.query_params.get("fields") if fields: data = utils.filter_field_keys(data, set(fields.split(","))) return data def transform(rv: Dict[str, Any]) -> Dict[str, Any]: """Transform an eventual returned value with multiple layers of information, keeping only what's necessary. If the returned value rv contains the 'results' key, this is the associated value which is returned. Otherwise, return the initial dict without the potential 'headers' key. """ if "results" in rv: return rv["results"] if "headers" in rv: rv.pop("headers") return rv def make_api_response( request: Request, data: Dict[str, Any], doc_data: Optional[Dict[str, Any]] = None, options: Optional[Dict[str, Any]] = None, ) -> HttpResponse: """Generates an API response based on the requested mimetype. Args: request: a DRF Request object data: raw data to return in the API response doc_data: documentation data for HTML response options: optional data that can be used to generate the response Returns: a DRF Response object """ options = options or {} if data: options["headers"] = compute_link_header(data, options) data = transform(data) data = filter_by_fields(request, data) doc_data = doc_data or {} headers = {} if "headers" in options: doc_data["headers_data"] = options["headers"] headers = options["headers"] # get request status code doc_data["status_code"] = options.get("status", 200) accepted_media_type = getattr(request, "accepted_media_type", "application/json") # when requesting HTML, typically when browsing the API through its # documented views, we need to enrich the input data with documentation # and render the apidoc HTML template if accepted_media_type == "text/html": doc_data["response_data"] = data if data is not None: doc_data["response_data"] = json.dumps( data, cls=JSONEncoder, sort_keys=True, indent=4, separators=(",", ": ") ) doc_data["heading"] = shorten_path(str(request.path)) # generate breadcrumbs data if "route" in doc_data: doc_data["endpoint_path"] = gen_path_info(doc_data["route"]) for i in range(len(doc_data["endpoint_path"]) - 1): doc_data["endpoint_path"][i]["path"] += "/doc/" if not doc_data["noargs"]: doc_data["endpoint_path"][-1]["path"] += "/doc/" response = render( request, "api/apidoc.html", doc_data, status=doc_data["status_code"] ) # otherwise simply return the raw data and let DRF pick # the correct renderer (JSON or YAML) else: response = Response( data, headers=headers, content_type=accepted_media_type, status=doc_data["status_code"], ) if getattr(request, "never_cache", False):
add_never_cache_headers(response) return response def error_response( request: Request, exception: Exception, doc_data: Dict[str, Any] ) -> HttpResponse: """Private function to create a custom error response. Args: request: a DRF Request object exception: the exception that caused the error doc_data: documentation data for HTML response """ error_code = 500 if isinstance(exception, BadInputExc): error_code = 400 elif isinstance(exception, NotFoundExc): error_code = 404 elif isinstance(exception, ForbiddenExc): error_code = 403 elif isinstance(exception, LargePayloadExc): error_code = 413 elif isinstance(exception, StorageDBError): error_code = 503 elif isinstance(exception, StorageAPIError): error_code = 503 elif isinstance(exception, APIException): error_code = exception.status_code error_opts = {"status": error_code} error_data = { "exception": exception.__class__.__name__, "reason": str(exception), } if getattr(request, "accepted_media_type", None) == "text/html": error_data["reason"] = escape(error_data["reason"]) if get_config()["debug"]: error_data["traceback"] = traceback.format_exc() logger.debug(error_data["traceback"]) return make_api_response(request, error_data, doc_data, options=error_opts) def error_response_handler( exc: Exception, context: Dict[str, Any] ) -> Optional[HttpResponse]: - """Custom DRF exception handler used to generate API error responses. - """ + """Custom DRF exception handler used to generate API error responses.""" sentry_capture_exception(exc) doc_data = getattr(exc, "doc_data", {}) return error_response(context["request"], exc, doc_data) diff --git a/swh/web/api/throttling.py b/swh/web/api/throttling.py index 66afa8b3..24e7f11b 100644 --- a/swh/web/api/throttling.py +++ b/swh/web/api/throttling.py @@ -1,215 +1,217 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from ipaddress import IPv4Network, IPv6Network, ip_address, ip_network from typing import Callable, List, TypeVar, Union import sentry_sdk from django.core.exceptions import ImproperlyConfigured import rest_framework from rest_framework.throttling import ScopedRateThrottle from swh.web.auth.utils import API_SAVE_ORIGIN_PERMISSION from swh.web.config import get_config APIView = TypeVar("APIView", bound="rest_framework.views.APIView") Request = rest_framework.request.Request API_THROTTLING_EXEMPTED_PERM = "swh.web.api.throttling_exempted" class SwhWebRateThrottle(ScopedRateThrottle): """Custom DRF request rate limiter for anonymous users. Requests are grouped into scopes. It enables applying different request rate limits based on the scope name but also on the input HTTP request type. To associate a scope to requests, one must add a 'throttle_scope' attribute when using a class based view, or call the 'throttle_scope' decorator when using a function based view. By default, requests do not have an associated scope and are not rate limited. Rate limiting can also be configured according to the type of the input HTTP requests for fine-grained tuning. For instance, the following YAML configuration section sets a rate of: - 1 per minute for POST requests - 60 per minute for other request types for the 'swh_api' scope while exempting those coming from the 127.0.0.0/8 ip network. ..
code-block:: yaml throttling: scopes: swh_api: limiter_rate: default: 60/m POST: 1/m exempted_networks: - 127.0.0.0/8 """ scope = None def __init__(self): super().__init__() self.exempted_networks = None self.num_requests = 0 self.duration = 0 def get_cache_key(self, request, view): # do not handle throttling if user is authenticated if request.user.is_authenticated: return None else: return super().get_cache_key(request, view) def get_exempted_networks( self, scope_name: str ) -> List[Union[IPv4Network, IPv6Network]]: if not self.exempted_networks: scopes = get_config()["throttling"]["scopes"] scope = scopes.get(scope_name) if scope: networks = scope.get("exempted_networks") if networks: self.exempted_networks = [ ip_network(network) for network in networks ] return self.exempted_networks def get_scope(self, view: APIView): if not self.scope: # class based view case return getattr(view, self.scope_attr, None) else: # function based view case return self.scope def allow_request(self, request: Request, view: APIView) -> bool: # class based view case if not self.scope: default_scope = getattr(view, self.scope_attr, None) request_allowed = None if default_scope is not None: # check if there is a specific rate limiting associated # to the request type assert request.method is not None request_scope = f"{default_scope}_{request.method.lower()}" setattr(view, self.scope_attr, request_scope) try: request_allowed = super().allow_request(request, view) # use default rate limiting otherwise except ImproperlyConfigured as exc: sentry_sdk.capture_exception(exc) setattr(view, self.scope_attr, default_scope) if request_allowed is None: request_allowed = super().allow_request(request, view) # function based view case else: default_scope = self.scope # check if there is a specific rate limiting associated # to the request type self.scope = default_scope + "_" + request.method.lower() try: self.rate = self.get_rate() # use default rate limiting otherwise except ImproperlyConfigured: self.scope = default_scope self.rate = self.get_rate() self.num_requests, self.duration = self.parse_rate(self.rate) request_allowed = super(ScopedRateThrottle, self).allow_request( request, view ) self.scope = default_scope exempted_networks = self.get_exempted_networks(default_scope) exempted_ip = False if exempted_networks: remote_address = ip_address(self.get_ident(request)) exempted_ip = any( remote_address in network for network in exempted_networks ) request_allowed = exempted_ip or request_allowed # set throttling related data in the request metadata # in order for the ThrottlingHeadersMiddleware to # add X-RateLimit-* headers in the HTTP response if not exempted_ip and hasattr(self, "history"): hit_count = len(self.history) request.META["RateLimit-Limit"] = self.num_requests request.META["RateLimit-Remaining"] = self.num_requests - hit_count wait = self.wait() if wait is not None: request.META["RateLimit-Reset"] = int(self.now + wait) return request_allowed class SwhWebUserRateThrottle(SwhWebRateThrottle): """Custom DRF request rate limiter for authenticated users. It has the same behavior as :class:`swh.web.api.throttling.SwhWebRateThrottle` except the number of allowed requests for each throttle scope is increased by a 10x factor.
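For instance, with the 'swh_api' scope configured at a default rate of 60/m as in the YAML example above, an authenticated user is allowed 600 requests per minute.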
""" NUM_REQUESTS_FACTOR = 10 def get_cache_key(self, request, view): # do not handle throttling if user is not authenticated if request.user.is_authenticated: return super(SwhWebRateThrottle, self).get_cache_key(request, view) else: return None def parse_rate(self, rate): # increase number of allowed requests num_requests, duration = super().parse_rate(rate) return (num_requests * self.NUM_REQUESTS_FACTOR, duration) def allow_request(self, request: Request, view: APIView) -> bool: if request.user.is_staff or request.user.has_perm(API_THROTTLING_EXEMPTED_PERM): # no throttling for staff users or users with adequate permission return True scope = self.get_scope(view) if scope == "save_origin" and request.user.has_perm(API_SAVE_ORIGIN_PERMISSION): # no throttling on save origin endpoint for users with adequate permission return True return super().allow_request(request, view) def throttle_scope(scope: str) -> Callable[..., APIView]: """Decorator that allows the throttle scope of a DRF function based view to be set:: @api_view(['GET', ]) @throttle_scope('scope') def view(request): ... """ def decorator(func: APIView) -> APIView: SwhScopeRateThrottle = type( "SwhWebScopeRateThrottle", (SwhWebRateThrottle,), {"scope": scope} ) SwhScopeUserRateThrottle = type( - "SwhWebScopeUserRateThrottle", (SwhWebUserRateThrottle,), {"scope": scope}, + "SwhWebScopeUserRateThrottle", + (SwhWebUserRateThrottle,), + {"scope": scope}, ) func.throttle_classes = (SwhScopeRateThrottle, SwhScopeUserRateThrottle) return func return decorator diff --git a/swh/web/api/utils.py b/swh/web/api/utils.py index fed85b45..36b003dc 100644 --- a/swh/web/api/utils.py +++ b/swh/web/api/utils.py @@ -1,352 +1,350 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, List, Optional, Tuple from django.http import HttpRequest from swh.web.common.query import parse_hash from swh.web.common.utils import resolve_branch_alias, reverse def filter_field_keys(data, field_keys): """Given an object instance (directory or list), and a csv field keys to filter on. Return the object instance with filtered keys. Note: Returns obj as is if it's an instance of types not in (dictionary, list) Args: - data: one object (dictionary, list...) to filter. - field_keys: csv or set of keys to filter the object on Returns: obj filtered on field_keys """ if isinstance(data, map): return map(lambda x: filter_field_keys(x, field_keys), data) if isinstance(data, list): return [filter_field_keys(x, field_keys) for x in data] if isinstance(data, dict): return {k: v for (k, v) in data.items() if k in field_keys} return data def person_to_string(person): - """Map a person (person, committer, tagger, etc...) to a string. - - """ + """Map a person (person, committer, tagger, etc...) to a string.""" return "".join([person["name"], " <", person["email"], ">"]) def enrich_object( object: Dict[str, str], request: Optional[HttpRequest] = None ) -> Dict[str, str]: """Enrich an object (revision, release) with link to the 'target' of type 'target_type'. Args: object: An object with target and target_type keys (e.g. 
release, revision) request: Absolute URIs will be generated if provided Returns: Object enriched with target object url (revision, release, content, directory) """ if "target" in object and "target_type" in object: if object["target_type"] in ("revision", "release", "directory"): object["target_url"] = reverse( "api-1-%s" % object["target_type"], url_args={"sha1_git": object["target"]}, request=request, ) elif object["target_type"] == "content": object["target_url"] = reverse( "api-1-content", url_args={"q": "sha1_git:" + object["target"]}, request=request, ) elif object["target_type"] == "snapshot": object["target_url"] = reverse( "api-1-snapshot", url_args={"snapshot_id": object["target"]}, request=request, ) return object enrich_release = enrich_object def enrich_directory_entry( directory: Dict[str, str], request: Optional[HttpRequest] = None ) -> Dict[str, str]: """Enrich directory entry with url to target. Args: directory: dict of data associated to a swh directory entry request: Absolute URIs will be generated if provided Returns: An enriched directory dict filled with additional url """ if "type" in directory: target_type = directory["type"] target = directory["target"] if target_type == "file": directory["target_url"] = reverse( "api-1-content", url_args={"q": "sha1_git:%s" % target}, request=request ) elif target_type == "dir": directory["target_url"] = reverse( "api-1-directory", url_args={"sha1_git": target}, request=request ) else: directory["target_url"] = reverse( "api-1-revision", url_args={"sha1_git": target}, request=request ) return directory def enrich_metadata_endpoint( content_metadata: Dict[str, str], request: Optional[HttpRequest] = None ) -> Dict[str, str]: """Enrich content metadata dict with link to the upper metadata endpoint. 
Args: content_metadata: dict of data associated to a swh content metadata request: Absolute URIs will be generated if provided Returns: An enriched content metadata dict filled with an additional url """ c = content_metadata c["content_url"] = reverse( "api-1-content", url_args={"q": "sha1:%s" % c["id"]}, request=request ) return c def enrich_content( content: Dict[str, Any], top_url: Optional[bool] = False, query_string: Optional[str] = None, request: Optional[HttpRequest] = None, ) -> Dict[str, str]: """Enrich content with links to: - data_url: its raw data - filetype_url: its filetype information - language_url: its programming language information - license_url: its licensing information Args: content: dict of data associated to a swh content object top_url: whether or not to include the content url in the enriched data query_string: optional query string of type '<algo>:<hash>' used when requesting the content, it acts as a hint for picking the same hash method when computing the url listed above request: Absolute URIs will be generated if provided Returns: An enriched content dict filled with additional urls """ checksums = content if "checksums" in content: checksums = content["checksums"] hash_algo = "sha1" if query_string: hash_algo = parse_hash(query_string)[0] if hash_algo in checksums: q = "%s:%s" % (hash_algo, checksums[hash_algo]) if top_url: content["content_url"] = reverse("api-1-content", url_args={"q": q}) content["data_url"] = reverse( "api-1-content-raw", url_args={"q": q}, request=request ) content["filetype_url"] = reverse( "api-1-content-filetype", url_args={"q": q}, request=request ) content["language_url"] = reverse( "api-1-content-language", url_args={"q": q}, request=request ) content["license_url"] = reverse( "api-1-content-license", url_args={"q": q}, request=request ) return content def enrich_revision( revision: Dict[str, Any], request: Optional[HttpRequest] = None ) -> Dict[str, Any]: """Enrich revision with links where it makes sense (directory, parents). Keep track of the navigation breadcrumbs if they are specified. 
Args: revision: the revision as a dict request: Absolute URIs will be generated if provided Returns: An enriched revision dict filled with additional urls """ revision["url"] = reverse( "api-1-revision", url_args={"sha1_git": revision["id"]}, request=request ) revision["history_url"] = reverse( "api-1-revision-log", url_args={"sha1_git": revision["id"]}, request=request ) if "directory" in revision: revision["directory_url"] = reverse( "api-1-directory", url_args={"sha1_git": revision["directory"]}, request=request, ) if "parents" in revision: parents = [] for parent in revision["parents"]: parents.append( { "id": parent, "url": reverse( "api-1-revision", url_args={"sha1_git": parent}, request=request ), } ) revision["parents"] = tuple(parents) if "children" in revision: children = [] for child in revision["children"]: children.append( reverse("api-1-revision", url_args={"sha1_git": child}, request=request) ) revision["children_urls"] = children if "decoding_failures" in revision and "message" in revision["decoding_failures"]: revision["message_url"] = reverse( "api-1-revision-raw-message", url_args={"sha1_git": revision["id"]}, request=request, ) return revision def enrich_snapshot( snapshot: Dict[str, Any], request: Optional[HttpRequest] = None ) -> Dict[str, Any]: """Enrich snapshot with links to the branch targets Args: snapshot: the snapshot as a dict request: Absolute URIs will be generated if provided Returns: An enriched snapshot dict filled with additional urls """ if "branches" in snapshot: snapshot["branches"] = { k: enrich_object(v, request) if v else None for k, v in snapshot["branches"].items() } for k, v in snapshot["branches"].items(): if v and v["target_type"] == "alias": branch = resolve_branch_alias(snapshot, v) if branch: branch = enrich_object(branch, request) v["target_url"] = branch["target_url"] return snapshot def enrich_origin( origin: Dict[str, Any], request: Optional[HttpRequest] = None ) -> Dict[str, Any]: """Enrich origin dict with link to its visits Args: origin: the origin as a dict request: Absolute URIs will be generated if provided Returns: An enriched origin dict filled with an additional url """ if "url" in origin: origin["origin_visits_url"] = reverse( "api-1-origin-visits", url_args={"origin_url": origin["url"]}, request=request, ) return origin def enrich_origin_search_result( origin_search_result: Tuple[List[Dict[str, Any]], Optional[str]], request: Optional[HttpRequest] = None, ) -> Tuple[List[Dict[str, Any]], Optional[str]]: """Enrich origin search result with additional links Args: origin_search_result: tuple returned when searching origins request: Absolute URIs will be generated if provided Returns: An enriched origin search result filled with additional urls """ origins, page_token = origin_search_result return [enrich_origin(origin, request=request) for origin in origins], page_token def enrich_origin_visit( origin_visit: Dict[str, Any], *, with_origin_link: bool, with_origin_visit_link: bool, request: Optional[HttpRequest] = None, ) -> Dict[str, Any]: """Enrich origin visit dict with additional links Args: origin_visit: the origin visit as a dict with_origin_link: whether to add link to origin with_origin_visit_link: whether to add link to origin visit request: Absolute URIs will be generated if provided Returns: An enriched origin visit dict filled with additional urls """ ov = origin_visit if with_origin_link: ov["origin_url"] = reverse( "api-1-origin", url_args={"origin_url": ov["origin"]}, request=request ) if with_origin_visit_link: 
ov["origin_visit_url"] = reverse( "api-1-origin-visit", url_args={"origin_url": ov["origin"], "visit_id": ov["visit"]}, request=request, ) snapshot = ov["snapshot"] if snapshot: ov["snapshot_url"] = reverse( "api-1-snapshot", url_args={"snapshot_id": snapshot}, request=request ) else: ov["snapshot_url"] = None return ov diff --git a/swh/web/api/views/add_forge_now.py b/swh/web/api/views/add_forge_now.py index ae2f0bb6..55917bc6 100644 --- a/swh/web/api/views/add_forge_now.py +++ b/swh/web/api/views/add_forge_now.py @@ -1,357 +1,367 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json from typing import Any, Dict, Union from django.core.exceptions import ObjectDoesNotExist from django.core.paginator import Paginator from django.db import transaction from django.forms import CharField, ModelForm from django.http import HttpResponseBadRequest from django.http.request import HttpRequest from django.http.response import HttpResponse, HttpResponseForbidden from rest_framework import serializers from rest_framework.request import Request from rest_framework.response import Response from swh.web.add_forge_now.models import Request as AddForgeRequest from swh.web.add_forge_now.models import RequestActorRole as AddForgeNowRequestActorRole from swh.web.add_forge_now.models import RequestHistory as AddForgeNowRequestHistory from swh.web.add_forge_now.models import RequestStatus as AddForgeNowRequestStatus from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.auth.utils import ADD_FORGE_MODERATOR_PERMISSION from swh.web.common.exc import BadInputExc from swh.web.common.utils import has_add_forge_now_permission, reverse def _block_while_testing(): - """Replaced by tests to check concurrency behavior - """ + """Replaced by tests to check concurrency behavior""" pass class AddForgeNowRequestForm(ModelForm): - forge_contact_comment = CharField(required=False,) + forge_contact_comment = CharField( + required=False, + ) class Meta: model = AddForgeRequest fields = ( "forge_type", "forge_url", "forge_contact_email", "forge_contact_name", "forge_contact_comment", "submitter_forward_username", ) class AddForgeNowRequestHistoryForm(ModelForm): - new_status = CharField(max_length=200, required=False,) + new_status = CharField( + max_length=200, + required=False, + ) class Meta: model = AddForgeNowRequestHistory fields = ("text", "new_status") class AddForgeNowRequestSerializer(serializers.ModelSerializer): class Meta: model = AddForgeRequest fields = "__all__" class AddForgeNowRequestPublicSerializer(serializers.ModelSerializer): - """Serializes AddForgeRequest without private fields. 
- """ + """Serializes AddForgeRequest without private fields.""" class Meta: model = AddForgeRequest fields = ("id", "forge_url", "forge_type", "status", "submission_date") class AddForgeNowRequestHistorySerializer(serializers.ModelSerializer): class Meta: model = AddForgeNowRequestHistory exclude = ("request",) class AddForgeNowRequestHistoryPublicSerializer(serializers.ModelSerializer): class Meta: model = AddForgeNowRequestHistory fields = ("id", "date", "new_status", "actor_role") @api_route( - r"/add-forge/request/create/", "api-1-add-forge-request-create", methods=["POST"], + r"/add-forge/request/create/", + "api-1-add-forge-request-create", + methods=["POST"], ) @api_doc("/add-forge/request/create") @format_docstring() @transaction.atomic def api_add_forge_request_create(request: Union[HttpRequest, Request]) -> HttpResponse: """ .. http:post:: /api/1/add-forge/request/create/ Create a new request to add a forge to the list of those crawled regularly by Software Heritage. .. warning:: That endpoint is not publicly available and requires authentication in order to be able to request it. {common_headers} :<json string forge_type: the type of forge :<json string forge_url: the base URL of the forge :<json string forge_contact_email: email of an administator of the forge to contact :<json string forge_contact_name: the name of the administrator :<json string forge_contact_comment: to explain how Software Heritage can verify forge administrator info are valid :statuscode 201: request successfully created :statuscode 400: missing or invalid field values :statuscode 403: user not authenticated """ if not request.user.is_authenticated: return HttpResponseForbidden( "You must be authenticated to create a new add-forge request" ) add_forge_request = AddForgeRequest() if isinstance(request, Request): # request submitted with request body in JSON (goes through DRF) form = AddForgeNowRequestForm(request.data, instance=add_forge_request) else: # request submitted with request body in form encoded format # (directly handled by Django) form = AddForgeNowRequestForm(request.POST, instance=add_forge_request) if form.errors: raise BadInputExc(json.dumps(form.errors)) try: existing_request = AddForgeRequest.objects.get( forge_url=add_forge_request.forge_url ) except ObjectDoesNotExist: pass else: return Response( f"Request for forge already exists (id {existing_request.id})", status=409, # Conflict ) add_forge_request.submitter_name = request.user.username add_forge_request.submitter_email = request.user.email form.save() request_history = AddForgeNowRequestHistory() request_history.request = add_forge_request request_history.new_status = AddForgeNowRequestStatus.PENDING.name request_history.actor = request.user.username request_history.actor_role = AddForgeNowRequestActorRole.SUBMITTER.name request_history.save() data = AddForgeNowRequestSerializer(add_forge_request).data return Response(data=data, status=201) @api_route( r"/add-forge/request/(?P<id>[0-9]+)/update/", "api-1-add-forge-request-update", methods=["POST"], ) @api_doc("/add-forge/request/update", tags=["hidden"]) @format_docstring() @transaction.atomic def api_add_forge_request_update( request: Union[HttpRequest, Request], id: int ) -> HttpResponse: """ .. http:post:: /api/1/add-forge/request/update/ Update a request to add a forge to the list of those crawled regularly by Software Heritage. .. warning:: That endpoint is not publicly available and requires authentication in order to be able to request it. 
{common_headers} :<json string text: comment about new request status :<json string new_status: the new request status :statuscode 200: request successfully updated :statuscode 400: missing or invalid field values :statuscode 403: user is not a moderator """ if not request.user.is_authenticated: return HttpResponseForbidden( "You must be authenticated to update an add-forge request" ) if not has_add_forge_now_permission(request.user): return HttpResponseForbidden("You are not a moderator") add_forge_request = ( AddForgeRequest.objects.filter(id=id).select_for_update().first() ) if add_forge_request is None: return HttpResponseBadRequest("Invalid request id") request_history = AddForgeNowRequestHistory() request_history.request = add_forge_request if isinstance(request, Request): # request submitted with request body in JSON (goes through DRF) form = AddForgeNowRequestHistoryForm(request.data, instance=request_history) else: # request submitted with request body in form encoded format # (directly handled by Django) form = AddForgeNowRequestHistoryForm(request.POST, instance=request_history) if form.errors: raise BadInputExc(json.dumps(form.errors)) new_status_str = form["new_status"].value() if new_status_str is not None: new_status = AddForgeNowRequestStatus[new_status_str] current_status = AddForgeNowRequestStatus[add_forge_request.status] if new_status not in current_status.allowed_next_statuses(): raise BadInputExc( f"New request status {new_status} cannot be reached " f"from current status {add_forge_request.status}" ) _block_while_testing() request_history.actor = request.user.username request_history.actor_role = AddForgeNowRequestActorRole.MODERATOR.name form.save(commit=False) if request_history.new_status == "": request_history.new_status = None request_history.save() if request_history.new_status is not None: add_forge_request.status = request_history.new_status add_forge_request.save() data = AddForgeNowRequestSerializer(add_forge_request).data return Response(data=data, status=200) @api_route( - r"/add-forge/request/list/", "api-1-add-forge-request-list", methods=["GET"], + r"/add-forge/request/list/", + "api-1-add-forge-request-list", + methods=["GET"], ) @api_doc("/add-forge/request/list") @format_docstring() def api_add_forge_request_list(request: Request): """ .. http:get:: /api/1/add-forge/request/list/ List add forge requests submitted by users.
{common_headers} {resheader_link} :query int page: optional page number :query int per_page: optional number of elements per page (bounded to 1000) :statuscode 200: always """ add_forge_requests = AddForgeRequest.objects.order_by("-id") page_num = int(request.GET.get("page", 1)) per_page = int(request.GET.get("per_page", 10)) per_page = min(per_page, 1000) if ( int(request.GET.get("user_requests_only", "0")) and request.user.is_authenticated ): add_forge_requests = add_forge_requests.filter( submitter_name=request.user.username ) paginator = Paginator(add_forge_requests, per_page) page = paginator.page(page_num) if request.user.has_perm(ADD_FORGE_MODERATOR_PERMISSION): requests = AddForgeNowRequestSerializer(page.object_list, many=True).data else: requests = AddForgeNowRequestPublicSerializer(page.object_list, many=True).data results = [dict(request) for request in requests] response: Dict[str, Any] = {"results": results, "headers": {}} if page.has_previous(): response["headers"]["link-prev"] = reverse( "api-1-add-forge-request-list", - query_params={"page": page.previous_page_number(), "per_page": per_page,}, + query_params={ + "page": page.previous_page_number(), + "per_page": per_page, + }, request=request, ) if page.has_next(): response["headers"]["link-next"] = reverse( "api-1-add-forge-request-list", query_params={"page": page.next_page_number(), "per_page": per_page}, request=request, ) return response @api_route( r"/add-forge/request/(?P<id>[0-9]+)/get/", "api-1-add-forge-request-get", methods=["GET"], ) @api_doc("/add-forge/request/get") @format_docstring() def api_add_forge_request_get(request: Request, id: int): """ .. http:get:: /api/1/add-forge/request/get/ Return all details about an add-forge request. {common_headers} :param int id: add-forge request identifier :statuscode 200: request details successfully returned :statuscode 400: request identifier does not exist """ try: add_forge_request = AddForgeRequest.objects.get(id=id) except ObjectDoesNotExist: raise BadInputExc("Request id does not exist") request_history = AddForgeNowRequestHistory.objects.filter( request=add_forge_request ).order_by("id") if request.user.is_authenticated and request.user.has_perm( ADD_FORGE_MODERATOR_PERMISSION ): data = AddForgeNowRequestSerializer(add_forge_request).data history = AddForgeNowRequestHistorySerializer(request_history, many=True).data else: data = AddForgeNowRequestPublicSerializer(add_forge_request).data history = AddForgeNowRequestHistoryPublicSerializer( request_history, many=True ).data return {"request": data, "history": history} diff --git a/swh/web/api/views/revision.py b/swh/web/api/views/revision.py index dd941c45..afd05512 100644 --- a/swh/web/api/views/revision.py +++ b/swh/web/api/views/revision.py @@ -1,214 +1,213 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.http import HttpResponse from swh.web.api import utils from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup from swh.web.common import archive DOC_RETURN_REVISION = """ :>json object author: information about the author of the revision :>json object committer: information about the committer of the revision :>json string committer_date: RFC3339 representation of the commit date :>json string date: 
RFC3339 representation of the revision date :>json string directory: the unique identifier of the directory that revision points to :>json string directory_url: link to :http:get:`/api/1/directory/(sha1_git)/[(path)/]` to get information about the directory associated to the revision :>json string id: the revision unique identifier :>json boolean merge: whether or not the revision corresponds to a merge commit :>json string message: the message associated to the revision :>json array parents: the parents of the revision, i.e. the previous revisions from which it directly derives; each entry of that array contains a unique parent revision identifier as well as a link to :http:get:`/api/1/revision/(sha1_git)/` to get more information about it :>json string type: the type of the revision """ DOC_RETURN_REVISION_ARRAY = DOC_RETURN_REVISION.replace(":>json", ":>jsonarr") @api_route( r"/revision/(?P<sha1_git>[0-9a-f]+)/", "api-1-revision", checksum_args=["sha1_git"] ) @api_doc("/revision/") @format_docstring(return_revision=DOC_RETURN_REVISION) def api_revision(request, sha1_git): """ .. http:get:: /api/1/revision/(sha1_git)/ Get information about a revision in the archive. Revisions are identified by **sha1** checksums, compatible with Git commit identifiers. See :func:`swh.model.git_objects.revision_git_object` in our data model module for details about how they are computed. :param string sha1_git: hexadecimal representation of the revision **sha1_git** identifier {common_headers} {return_revision} :statuscode 200: no error :statuscode 400: an invalid **sha1_git** value has been provided :statuscode 404: requested revision can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/` """ return api_lookup( archive.lookup_revision, sha1_git, notfound_msg="Revision with sha1_git {} not found.".format(sha1_git), enrich_fn=utils.enrich_revision, request=request, ) @api_route( r"/revision/(?P<sha1_git>[0-9a-f]+)/raw/", "api-1-revision-raw-message", checksum_args=["sha1_git"], ) @api_doc("/revision/raw/", tags=["hidden"]) def api_revision_raw_message(request, sha1_git): - """Return the raw data of the message of revision identified by sha1_git - """ + """Return the raw data of the message of revision identified by sha1_git""" raw = archive.lookup_revision_message(sha1_git) response = HttpResponse(raw["message"], content_type="application/octet-stream") response["Content-disposition"] = "attachment;filename=rev_%s_raw" % sha1_git return response @api_route( r"/revision/(?P<sha1_git>[0-9a-f]+)/directory/", "api-1-revision-directory", checksum_args=["sha1_git"], ) @api_route( r"/revision/(?P<sha1_git>[0-9a-f]+)/directory/(?P<dir_path>.+)/", "api-1-revision-directory", checksum_args=["sha1_git"], ) @api_doc("/revision/directory/") @format_docstring() def api_revision_directory(request, sha1_git, dir_path=None, with_data=False): """ .. http:get:: /api/1/revision/(sha1_git)/directory/[(path)/] Get information about directory (entry) objects associated to revisions. Each revision is associated to a single "root" directory. This endpoint behaves like :http:get:`/api/1/directory/(sha1_git)/[(path)/]`, but operates on the root directory associated to a given revision.
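As an aside, the equivalence stated in that last sentence can be exercised against the public archive with two client calls; the instance URL is an assumption, and the revision id is the example from the docstring above:

    import requests

    API = "https://archive.softwareheritage.org/api/1"  # assumed public instance
    sha1_git = "aafb16d69fd30ff58afdd69036a26047f3aebdc6"
    rev = requests.get(f"{API}/revision/{sha1_git}/").json()
    # directory_url links to the /api/1/directory/ endpoint for the root tree
    root_entries = requests.get(rev["directory_url"]).json()
    # the revision-scoped endpoint exposes the same listing under "content"
    same_entries = requests.get(f"{API}/revision/{sha1_git}/directory/").json()["content"]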
:param string sha1_git: hexadecimal representation of the revision **sha1_git** identifier :param string path: optional parameter to get information about the directory entry pointed to by that relative path {common_headers} :>json array content: directory entries as returned by :http:get:`/api/1/directory/(sha1_git)/[(path)/]` :>json string path: path of the directory relative to the revision root one :>json string revision: the unique revision identifier :>json string type: the type of the directory :statuscode 200: no error :statuscode 400: an invalid **sha1_git** value has been provided :statuscode 404: requested revision can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`revision/f1b94134a4b879bc55c3dacdb496690c8ebdc03f/directory/` """ rev_id, result = archive.lookup_directory_through_revision( {"sha1_git": sha1_git}, dir_path, with_data=with_data ) content = result["content"] if result["type"] == "dir": # dir_entries result["content"] = [ utils.enrich_directory_entry(entry, request=request) for entry in content ] elif result["type"] == "file": # content result["content"] = utils.enrich_content(content, request=request) elif result["type"] == "rev": # revision result["content"] = utils.enrich_revision(content, request=request) return result @api_route( r"/revision/(?P<sha1_git>[0-9a-f]+)/log/", "api-1-revision-log", checksum_args=["sha1_git"], ) @api_doc("/revision/log/") @format_docstring(return_revision_array=DOC_RETURN_REVISION_ARRAY) def api_revision_log(request, sha1_git): """ .. http:get:: /api/1/revision/(sha1_git)/log/ Get a list of all revisions heading to a given one, in other words show the commit log. The revisions are returned in breadth-first search order while visiting the revision graph. The number of revisions to return is also bounded by the **limit** query parameter. .. warning:: To get the full BFS traversal of the revision graph when the total number of revisions is greater than 1000, it is up to the client to keep track of the multiple branches of history when there are merge revisions in the returned objects. In other words, identify all the continuation points that need to be followed to get the full history through recursion. :param string sha1_git: hexadecimal representation of the revision **sha1_git** identifier :query int limit: maximum number of revisions to return when performing BFS traversal on the revision graph (defaults to 10, can not exceed 1000) {common_headers} {return_revision_array} :statuscode 200: no error :statuscode 400: an invalid **sha1_git** value has been provided :statuscode 404: head revision can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`revision/e1a315fa3fa734e2a6154ed7b5b9ae0eb8987aad/log/` """ limit = int(request.query_params.get("limit", "10")) limit = min(limit, 1000) error_msg = "Revision with sha1_git %s not found."
% sha1_git revisions = api_lookup( archive.lookup_revision_log, sha1_git, limit, notfound_msg=error_msg, enrich_fn=utils.enrich_revision, request=request, ) return {"results": revisions} diff --git a/swh/web/api/views/utils.py b/swh/web/api/views/utils.py index a1db1274..d6fb6b19 100644 --- a/swh/web/api/views/utils.py +++ b/swh/web/api/views/utils.py @@ -1,94 +1,92 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from types import GeneratorType from typing import Any, Callable, Dict, Mapping, Optional from typing_extensions import Protocol from django.http import HttpRequest from rest_framework.decorators import api_view from rest_framework.response import Response from swh.web.api.apiurls import APIUrls, api_route from swh.web.common.exc import NotFoundExc class EnrichFunction(Protocol): def __call__( self, input: Mapping[str, str], request: Optional[HttpRequest] ) -> Dict[str, str]: ... def api_lookup( lookup_fn: Callable[..., Any], *args: Any, notfound_msg: Optional[str] = "Object not found", enrich_fn: Optional[EnrichFunction] = None, request: Optional[HttpRequest] = None, **kwargs: Any, ): r""" Capture a redundant behavior of: - looking up the backend with a criterion (be it an identifier or checksum) passed to the function lookup_fn - if nothing is found, raise a NotFoundExc exception with error message notfound_msg. - Otherwise if something is returned: - if it is a list, map or generator, apply the enrich_fn function to it and return the resulting data structure as a list. - if it is a dict, pass it to enrich_fn and return the enriched dict. Args: - lookup_fn: function expecting one criterion and optional supplementary \*args. - \*args: supplementary arguments to pass to lookup_fn. - notfound_msg: if nothing matching the criterion is found, raise NotFoundExc with this error message. - enrich_fn: Function to use to enrich the result returned by lookup_fn. Defaults to the identity function if not provided. - request: Input HTTP request that will be provided as parameter to enrich_fn. Raises: NotFoundExc or whatever `lookup_fn` raises. """ def _enrich_fn_noop(x, request): return x if enrich_fn is None: enrich_fn = _enrich_fn_noop res = lookup_fn(*args, **kwargs) if res is None: raise NotFoundExc(notfound_msg) if isinstance(res, (list, GeneratorType)) or type(res) == map: return [enrich_fn(x, request=request) for x in res] return enrich_fn(res, request=request) @api_view(["GET", "HEAD"]) def api_home(request): return Response({}, template_name="api/api.html") APIUrls.add_url_pattern(r"^$", api_home, view_name="api-1-homepage") @api_route(r"/", "api-1-endpoints") def api_endpoints(request): - """Display the list of opened api endpoints.
- - """ + """Display the list of opened api endpoints.""" routes = APIUrls.get_app_endpoints().copy() for route, doc in routes.items(): doc["doc_intro"] = doc["docstring"].split("\n\n")[0] # Return a list of routes with consistent ordering env = {"doc_routes": sorted(routes.items())} return Response(env, template_name="api/endpoints.html") diff --git a/swh/web/api/views/vault.py b/swh/web/api/views/vault.py index 18549e4c..07ad8d9c 100644 --- a/swh/web/api/views/vault.py +++ b/swh/web/api/views/vault.py @@ -1,505 +1,514 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict from django.http import HttpResponse from django.shortcuts import redirect from swh.model.hashutil import hash_to_hex from swh.model.swhids import CoreSWHID, ObjectType from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup from swh.web.common import archive, query from swh.web.common.exc import BadInputExc from swh.web.common.utils import reverse ###################################################### # Common SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}" # XXX: a bit spaghetti. Would be better with class-based views. def _dispatch_cook_progress(request, bundle_type: str, swhid: CoreSWHID): if request.method == "GET": return api_lookup( archive.vault_progress, bundle_type, swhid, notfound_msg=f"Cooking of {swhid} was never requested.", request=request, ) elif request.method == "POST": email = request.POST.get("email", request.GET.get("email", None)) return api_lookup( archive.vault_cook, bundle_type, swhid, email, notfound_msg=f"{swhid} not found.", request=request, ) def _vault_response( vault_response: Dict[str, Any], add_legacy_items: bool ) -> Dict[str, Any]: d = { "fetch_url": vault_response["fetch_url"], "progress_message": vault_response["progress_msg"], "id": vault_response["task_id"], "status": vault_response["task_status"], "swhid": str(vault_response["swhid"]), } if add_legacy_items: d["obj_type"] = vault_response["swhid"].object_type.name.lower() d["obj_id"] = hash_to_hex(vault_response["swhid"].object_id) return d ###################################################### # Flat bundles @api_route( f"/vault/flat/(?P<swhid>{SWHID_RE})/", "api-1-vault-cook-flat", methods=["GET", "POST"], throttle_scope="swh_vault_cooking", never_cache=True, ) @api_doc("/vault/flat/") @format_docstring() def api_vault_cook_flat(request, swhid): """ .. http:get:: /api/1/vault/flat/(swhid)/ .. http:post:: /api/1/vault/flat/(swhid)/ Request the cooking of a simple archive, typically for a directory. That endpoint enables to create a vault cooking task for a directory through a POST request or check the status of a previously created one through a GET request. Once the cooking task has been executed, the resulting archive can be downloaded using the dedicated endpoint :http:get:`/api/1/vault/flat/(swhid)/raw/`. 
Then to extract the cooked directory in the current one, use:: $ tar xvf path/to/swh_1_*.tar.gz :param string swhid: the object's SWHID :query string email: e-mail to notify when the archive is ready {common_headers} :>json string fetch_url: the url from which to download the archive once it has been cooked (see :http:get:`/api/1/vault/flat/(swhid)/raw/`) :>json string progress_message: message describing the cooking task progress :>json number id: the cooking task id :>json string status: the cooking task status (either **new**, **pending**, **done** or **failed**) :>json string swhid: the identifier of the object to cook :statuscode 200: no error :statuscode 400: an invalid directory identifier has been provided :statuscode 404: requested directory did not receive any cooking request yet (in case of GET) or can not be found in the archive (in case of POST) """ swhid = CoreSWHID.from_string(swhid) if swhid.object_type == ObjectType.DIRECTORY: res = _dispatch_cook_progress(request, "flat", swhid) res["fetch_url"] = reverse( - "api-1-vault-fetch-flat", url_args={"swhid": str(swhid)}, request=request, + "api-1-vault-fetch-flat", + url_args={"swhid": str(swhid)}, + request=request, ) return _vault_response(res, add_legacy_items=False) elif swhid.object_type == ObjectType.CONTENT: raise BadInputExc( "Content objects do not need to be cooked, " "use `/api/1/content/raw/` instead." ) elif swhid.object_type == ObjectType.REVISION: # TODO: support revisions too? (the vault allows it) raise BadInputExc( "Only directories can be cooked as 'flat' bundles. " "Use `/api/1/vault/gitfast/` to cook revisions, as gitfast bundles." ) else: raise BadInputExc("Only directories can be cooked as 'flat' bundles.") @api_route( r"/vault/directory/(?P<dir_id>[0-9a-f]+)/", "api-1-vault-cook-directory", methods=["GET", "POST"], checksum_args=["dir_id"], throttle_scope="swh_vault_cooking", never_cache=True, ) @api_doc("/vault/directory/", tags=["deprecated"]) @format_docstring() def api_vault_cook_directory(request, dir_id): """ .. http:get:: /api/1/vault/directory/(dir_id)/ This endpoint was replaced by :http:get:`/api/1/vault/flat/(swhid)/` """ _, obj_id = query.parse_hash_with_algorithms_or_throws( dir_id, ["sha1"], "Only sha1_git is supported." ) swhid = f"swh:1:dir:{obj_id.hex()}" res = _dispatch_cook_progress(request, "flat", CoreSWHID.from_string(swhid)) res["fetch_url"] = reverse( - "api-1-vault-fetch-flat", url_args={"swhid": swhid}, request=request, + "api-1-vault-fetch-flat", + url_args={"swhid": swhid}, + request=request, ) return _vault_response(res, add_legacy_items=True) @api_route( - f"/vault/flat/(?P<swhid>{SWHID_RE})/raw/", "api-1-vault-fetch-flat", + f"/vault/flat/(?P<swhid>{SWHID_RE})/raw/", + "api-1-vault-fetch-flat", ) @api_doc("/vault/flat/raw/") def api_vault_fetch_flat(request, swhid): """ .. http:get:: /api/1/vault/flat/(swhid)/raw/ Fetch the cooked archive for a flat bundle. See :http:get:`/api/1/vault/flat/(swhid)/` to get more details on 'flat' bundle cooking. 
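Putting the cook, poll and fetch steps together, a client round trip could be sketched as follows; the instance URL and the SWHID are placeholders, and the polling interval is arbitrary:

    import time

    import requests

    API = "https://archive.softwareheritage.org/api/1"  # assumed public instance
    swhid = "swh:1:dir:" + "0" * 40  # placeholder directory SWHID

    task = requests.post(f"{API}/vault/flat/{swhid}/").json()  # trigger cooking
    while task["status"] in ("new", "pending"):
        time.sleep(60)  # arbitrary polling interval
        task = requests.get(f"{API}/vault/flat/{swhid}/").json()
    if task["status"] == "done":
        with open("cooked.tar.gz", "wb") as f:
            f.write(requests.get(task["fetch_url"]).content)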
:param string swhid: the SWHID of the object to cook :resheader Content-Type: application/gzip :statuscode 200: no error :statuscode 404: requested directory did not receive any cooking request yet (in case of GET) or can not be found in the archive (in case of POST) """ res = api_lookup( archive.vault_fetch, "flat", CoreSWHID.from_string(swhid), notfound_msg=f"Cooked archive for {swhid} not found.", request=request, ) fname = "{}.tar.gz".format(swhid) response = HttpResponse(res, content_type="application/gzip") response["Content-disposition"] = "attachment; filename={}".format( fname.replace(":", "_") ) return response @api_route( r"/vault/directory/(?P<dir_id>[0-9a-f]+)/raw/", "api-1-vault-fetch-directory", checksum_args=["dir_id"], ) @api_doc("/vault/directory/raw/", tags=["hidden", "deprecated"]) def api_vault_fetch_directory(request, dir_id): """ .. http:get:: /api/1/vault/directory/(dir_id)/raw/ This endpoint was replaced by :http:get:`/api/1/vault/flat/(swhid)/raw/` """ _, obj_id = query.parse_hash_with_algorithms_or_throws( dir_id, ["sha1"], "Only sha1_git is supported." ) rev_flat_raw_url = reverse( "api-1-vault-fetch-flat", url_args={"swhid": f"swh:1:dir:{dir_id}"} ) return redirect(rev_flat_raw_url) ###################################################### # gitfast bundles @api_route( f"/vault/gitfast/(?P<swhid>{SWHID_RE})/", "api-1-vault-cook-gitfast", methods=["GET", "POST"], throttle_scope="swh_vault_cooking", never_cache=True, ) @api_doc("/vault/gitfast/") @format_docstring() def api_vault_cook_gitfast(request, swhid): """ .. http:get:: /api/1/vault/gitfast/(swhid)/ .. http:post:: /api/1/vault/gitfast/(swhid)/ Request the cooking of a gitfast archive for a revision or check its cooking status. This endpoint allows creating a vault cooking task for a revision through a POST request, or checking the status of a previously created one through a GET request. Once the cooking task has been executed, the resulting gitfast archive can be downloaded using the dedicated endpoint :http:get:`/api/1/vault/gitfast/(swhid)/raw/`. Then to import the revision in the current directory, use:: $ git init $ zcat path/to/swh_1_rev_*.gitfast.gz | git fast-import $ git checkout HEAD :param string swhid: the revision's permanent identifier :query string email: e-mail to notify when the gitfast archive is ready {common_headers} :>json string fetch_url: the url from which to download the archive once it has been cooked (see :http:get:`/api/1/vault/gitfast/(swhid)/raw/`) :>json string progress_message: message describing the cooking task progress :>json number id: the cooking task id :>json string status: the cooking task status (new/pending/done/failed) :>json string swhid: the identifier of the object to cook :statuscode 200: no error :statuscode 404: requested revision did not receive any cooking request yet (in case of GET) or can not be found in the archive (in case of POST) """ swhid = CoreSWHID.from_string(swhid) if swhid.object_type == ObjectType.REVISION: res = _dispatch_cook_progress(request, "gitfast", swhid) res["fetch_url"] = reverse( "api-1-vault-fetch-gitfast", url_args={"swhid": str(swhid)}, request=request, ) return _vault_response(res, add_legacy_items=False) elif swhid.object_type == ObjectType.CONTENT: raise BadInputExc( "Content objects do not need to be cooked, " "use `/api/1/content/raw/` instead." ) elif swhid.object_type == ObjectType.DIRECTORY: raise BadInputExc( "Only revisions can be cooked as 'gitfast' bundles. 
" "Use `/api/1/vault/flat/` to cook directories, as flat bundles." ) else: raise BadInputExc("Only revisions can be cooked as 'gitfast' bundles.") @api_route( r"/vault/revision/(?P<rev_id>[0-9a-f]+)/gitfast/", "api-1-vault-cook-revision_gitfast", methods=["GET", "POST"], checksum_args=["rev_id"], throttle_scope="swh_vault_cooking", never_cache=True, ) @api_doc("/vault/revision/gitfast/", tags=["deprecated"]) @format_docstring() def api_vault_cook_revision_gitfast(request, rev_id): """ .. http:get:: /api/1/vault/revision/(rev_id)/gitfast/ This endpoint was replaced by :http:get:`/api/1/vault/gitfast/(swhid)/` """ _, obj_id = query.parse_hash_with_algorithms_or_throws( rev_id, ["sha1"], "Only sha1_git is supported." ) swhid = f"swh:1:rev:{obj_id.hex()}" res = _dispatch_cook_progress(request, "gitfast", CoreSWHID.from_string(swhid)) res["fetch_url"] = reverse( - "api-1-vault-fetch-gitfast", url_args={"swhid": swhid}, request=request, + "api-1-vault-fetch-gitfast", + url_args={"swhid": swhid}, + request=request, ) return _vault_response(res, add_legacy_items=True) @api_route( - f"/vault/gitfast/(?P<swhid>{SWHID_RE})/raw/", "api-1-vault-fetch-gitfast", + f"/vault/gitfast/(?P<swhid>{SWHID_RE})/raw/", + "api-1-vault-fetch-gitfast", ) @api_doc("/vault/gitfast/raw/") def api_vault_fetch_revision_gitfast(request, swhid): """ .. http:get:: /api/1/vault/gitfast/(swhid)/raw/ Fetch the cooked gitfast archive for a revision. See :http:get:`/api/1/vault/gitfast/(swhid)/` to get more details on gitfast cooking. :param string rev_id: the revision's sha1 identifier :resheader Content-Type: application/gzip :statuscode 200: no error :statuscode 404: requested directory did not receive any cooking request yet (in case of GET) or can not be found in the archive (in case of POST) """ res = api_lookup( archive.vault_fetch, "gitfast", CoreSWHID.from_string(swhid), notfound_msg="Cooked archive for {} not found.".format(swhid), request=request, ) fname = "{}.gitfast.gz".format(swhid) response = HttpResponse(res, content_type="application/gzip") response["Content-disposition"] = "attachment; filename={}".format( fname.replace(":", "_") ) return response @api_route( r"/vault/revision/(?P<rev_id>[0-9a-f]+)/gitfast/raw/", "api-1-vault-fetch-revision_gitfast", checksum_args=["rev_id"], ) @api_doc("/vault/revision_gitfast/raw/", tags=["hidden", "deprecated"]) def _api_vault_revision_gitfast_raw(request, rev_id): """ .. http:get:: /api/1/vault/revision/(rev_id)/gitfast/raw/ This endpoint was replaced by :http:get:`/api/1/vault/gitfast/(swhid)/raw/` """ rev_gitfast_raw_url = reverse( "api-1-vault-fetch-gitfast", url_args={"swhid": f"swh:1:rev:{rev_id}"} ) return redirect(rev_gitfast_raw_url) ###################################################### # git_bare bundles @api_route( f"/vault/git-bare/(?P<swhid>{SWHID_RE})/", "api-1-vault-cook-git-bare", methods=["GET", "POST"], throttle_scope="swh_vault_cooking", never_cache=True, ) @api_doc("/vault/git-bare/") @format_docstring() def api_vault_cook_git_bare(request, swhid): """ .. http:get:: /api/1/vault/git-bare/(swhid)/ .. http:post:: /api/1/vault/git-bare/(swhid)/ Request the cooking of a git-bare archive for a revision or check its cooking status. That endpoint enables to create a vault cooking task for a revision through a POST request or check the status of a previously created one through a GET request. Once the cooking task has been executed, the resulting git-bare archive can be downloaded using the dedicated endpoint :http:get:`/api/1/vault/git-bare/(swhid)/raw/`. 
Then to import the revision in the current directory, use:: $ tar -xf path/to/swh_1_rev_*.git.tar $ git clone swh:1:rev:*.git new_repository (replace ``swh:1:rev:*`` with the SWHID of the requested revision) This will create a directory called ``new_repository``, which is a git repository containing the requested objects. :param string swhid: the revision's permanent identifier :query string email: e-mail to notify when the git-bare archive is ready {common_headers} :>json string fetch_url: the url from which to download the archive once it has been cooked (see :http:get:`/api/1/vault/git-bare/(swhid)/raw/`) :>json string progress_message: message describing the cooking task progress :>json number id: the cooking task id :>json string status: the cooking task status (new/pending/done/failed) :>json string swhid: the identifier of the object to cook :statuscode 200: no error :statuscode 404: requested revision did not receive any cooking request yet (in case of GET) or can not be found in the archive (in case of POST) """ swhid = CoreSWHID.from_string(swhid) if swhid.object_type == ObjectType.REVISION: res = _dispatch_cook_progress(request, "git_bare", swhid) res["fetch_url"] = reverse( "api-1-vault-fetch-git-bare", url_args={"swhid": str(swhid)}, request=request, ) return _vault_response(res, add_legacy_items=False) elif swhid.object_type == ObjectType.CONTENT: raise BadInputExc( "Content objects do not need to be cooked, " "use `/api/1/content/raw/` instead." ) elif swhid.object_type == ObjectType.DIRECTORY: raise BadInputExc( "Only revisions can be cooked as 'git-bare' bundles. " "Use `/api/1/vault/flat/` to cook directories, as flat bundles." ) else: raise BadInputExc("Only revisions can be cooked as 'git-bare' bundles.") @api_route( - f"/vault/git-bare/(?P<swhid>{SWHID_RE})/raw/", "api-1-vault-fetch-git-bare", + f"/vault/git-bare/(?P<swhid>{SWHID_RE})/raw/", + "api-1-vault-fetch-git-bare", ) @api_doc("/vault/git-bare/raw/") def api_vault_fetch_revision_git_bare(request, swhid): """ .. http:get:: /api/1/vault/git-bare/(swhid)/raw/ Fetch the cooked git-bare archive for a revision. See :http:get:`/api/1/vault/git-bare/(swhid)/` to get more details on git-bare cooking.
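As the BadInputExc branches in the three cooking views make explicit, each bundle type accepts exactly one object type: directories cook as 'flat', revisions as 'gitfast' or 'git-bare', and contents are served directly by /api/1/content/raw/. A client-side helper mirroring that dispatch might look like this sketch (the helper name is mine):

    from swh.model.swhids import CoreSWHID, ObjectType

    def vault_cook_endpoint(swhid: str, revision_bundle: str = "git-bare") -> str:
        """Pick the vault cooking endpoint matching a SWHID's object type."""
        object_type = CoreSWHID.from_string(swhid).object_type
        if object_type == ObjectType.DIRECTORY:
            return f"/api/1/vault/flat/{swhid}/"
        if object_type == ObjectType.REVISION:
            # both gitfast and git-bare bundles accept revisions
            return f"/api/1/vault/{revision_bundle}/{swhid}/"
        if object_type == ObjectType.CONTENT:
            raise ValueError("contents are served by /api/1/content/raw/ instead")
        raise ValueError(f"{object_type.name.lower()} objects cannot be cooked")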
:param string swhid: the revision's permanent identifier :resheader Content-Type: application/x-tar :statuscode 200: no error :statuscode 404: requested revision did not receive any cooking request yet (in case of GET) or can not be found in the archive (in case of POST) """ res = api_lookup( archive.vault_fetch, "git_bare", CoreSWHID.from_string(swhid), notfound_msg="Cooked archive for {} not found.".format(swhid), request=request, ) fname = "{}.git.tar".format(swhid) response = HttpResponse(res, content_type="application/x-tar") response["Content-disposition"] = "attachment; filename={}".format( fname.replace(":", "_") ) return response diff --git a/swh/web/auth/mailmap.py b/swh/web/auth/mailmap.py index 6c651c0f..3108f062 100644 --- a/swh/web/auth/mailmap.py +++ b/swh/web/auth/mailmap.py @@ -1,196 +1,204 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json from typing import Any, Dict from django.conf.urls import url from django.core.paginator import Paginator from django.db import IntegrityError from django.db.models import Q from django.http.request import HttpRequest from django.http.response import ( HttpResponse, HttpResponseBadRequest, HttpResponseNotFound, JsonResponse, ) from rest_framework import serializers from rest_framework.decorators import api_view from rest_framework.request import Request from rest_framework.response import Response from swh.web.auth.models import UserMailmap, UserMailmapEvent from swh.web.auth.utils import ( MAILMAP_ADMIN_PERMISSION, MAILMAP_PERMISSION, any_permission_required, ) class UserMailmapSerializer(serializers.ModelSerializer): class Meta: model = UserMailmap fields = "__all__" @api_view(["GET"]) @any_permission_required(MAILMAP_PERMISSION, MAILMAP_ADMIN_PERMISSION) def profile_list_mailmap(request: Request) -> HttpResponse: mailmap_admin = request.user.has_perm(MAILMAP_ADMIN_PERMISSION) mms = UserMailmap.objects.filter( user_id=None if mailmap_admin else str(request.user.id) ).all() return Response(UserMailmapSerializer(mms, many=True).data) @api_view(["POST"]) @any_permission_required(MAILMAP_PERMISSION, MAILMAP_ADMIN_PERMISSION) def profile_add_mailmap(request: Request) -> HttpResponse: mailmap_admin = request.user.has_perm(MAILMAP_ADMIN_PERMISSION) event = UserMailmapEvent.objects.create( user_id=str(request.user.id), request_type="add", request=json.dumps(request.data), ) from_email = request.data.pop("from_email", None) if not from_email: return HttpResponseBadRequest( "'from_email' must be provided and non-empty.", content_type="text/plain" ) user_id = None if mailmap_admin else str(request.user.id) from_email_verified = request.data.pop("from_email_verified", False) if mailmap_admin: # consider email verified when mailmap is added by admin from_email_verified = True try: UserMailmap.objects.create( user_id=user_id, from_email=from_email, from_email_verified=from_email_verified, **request.data, ) except IntegrityError as e: if ( "user_mailmap_from_email_key" in e.args[0] or "user_mailmap.from_email" in e.args[0] ): return HttpResponseBadRequest( "This 'from_email' already exists.", content_type="text/plain" ) else: raise event.successful = True event.save() mm = UserMailmap.objects.get(user_id=user_id, from_email=from_email) return Response(UserMailmapSerializer(mm).data) @api_view(["POST"])
@any_permission_required(MAILMAP_PERMISSION, MAILMAP_ADMIN_PERMISSION) def profile_update_mailmap(request: Request) -> HttpResponse: mailmap_admin = request.user.has_perm(MAILMAP_ADMIN_PERMISSION) event = UserMailmapEvent.objects.create( user_id=str(request.user.id), request_type="update", request=json.dumps(request.data), ) from_email = request.data.pop("from_email", None) if not from_email: return HttpResponseBadRequest( "'from_email' must be provided and non-empty.", content_type="text/plain" ) user_id = None if mailmap_admin else str(request.user.id) try: to_update = ( UserMailmap.objects.filter(user_id=user_id) .filter(from_email=from_email) .get() ) except UserMailmap.DoesNotExist: return HttpResponseNotFound("'from_email' cannot be found in mailmaps.") for attr, value in request.data.items(): setattr(to_update, attr, value) to_update.save() event.successful = True event.save() mm = UserMailmap.objects.get(user_id=user_id, from_email=from_email) return Response(UserMailmapSerializer(mm).data) @any_permission_required(MAILMAP_PERMISSION, MAILMAP_ADMIN_PERMISSION) def profile_list_mailmap_datatables(request: HttpRequest) -> HttpResponse: mailmap_admin = request.user.has_perm(MAILMAP_ADMIN_PERMISSION) mailmaps = UserMailmap.objects.filter( user_id=None if mailmap_admin else str(request.user.id) ) search_value = request.GET.get("search[value]", "") column_order = request.GET.get("order[0][column]") field_order = request.GET.get(f"columns[{column_order}][name]", "from_email") order_dir = request.GET.get("order[0][dir]", "asc") if order_dir == "desc": field_order = "-" + field_order mailmaps = mailmaps.order_by(field_order) table_data: Dict[str, Any] = {} table_data["draw"] = int(request.GET.get("draw", 1)) table_data["recordsTotal"] = mailmaps.count() length = int(request.GET.get("length", 10)) page = int(request.GET.get("start", 0)) / length + 1 if search_value: mailmaps = mailmaps.filter( Q(from_email__icontains=search_value) | Q(display_name__icontains=search_value) ) table_data["recordsFiltered"] = mailmaps.count() paginator = Paginator(mailmaps, length) mailmaps_data = [ UserMailmapSerializer(mm).data for mm in paginator.page(int(page)).object_list ] table_data["data"] = mailmaps_data return JsonResponse(table_data) urlpatterns = [ - url(r"^profile/mailmap/list/$", profile_list_mailmap, name="profile-mailmap-list",), - url(r"^profile/mailmap/add/$", profile_add_mailmap, name="profile-mailmap-add",), + url( + r"^profile/mailmap/list/$", + profile_list_mailmap, + name="profile-mailmap-list", + ), + url( + r"^profile/mailmap/add/$", + profile_add_mailmap, + name="profile-mailmap-add", + ), url( r"^profile/mailmap/update/$", profile_update_mailmap, name="profile-mailmap-update", ), url( r"^profile/mailmap/list/datatables/$", profile_list_mailmap_datatables, name="profile-mailmap-list-datatables", ), ] diff --git a/swh/web/auth/migrations/0001_initial.py b/swh/web/auth/migrations/0001_initial.py index fee14893..a1a2c962 100644 --- a/swh/web/auth/migrations/0001_initial.py +++ b/swh/web/auth/migrations/0001_initial.py @@ -1,44 +1,52 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import django.contrib.auth.models from django.db import migrations, models class Migration(migrations.Migration): initial = True dependencies = [ ("auth", "0011_update_proxy_permissions"), ] operations = [ 
migrations.CreateModel( name="OIDCUserOfflineTokens", fields=[ ( "id", models.AutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ("user_id", models.CharField(max_length=50)), ("creation_date", models.DateTimeField(auto_now_add=True)), ("offline_token", models.BinaryField()), ], - options={"db_table": "oidc_user_offline_tokens",}, + options={ + "db_table": "oidc_user_offline_tokens", + }, ), migrations.CreateModel( name="OIDCUser", fields=[], - options={"proxy": True, "indexes": [], "constraints": [],}, + options={ + "proxy": True, + "indexes": [], + "constraints": [], + }, bases=("auth.user",), - managers=[("objects", django.contrib.auth.models.UserManager()),], + managers=[ + ("objects", django.contrib.auth.models.UserManager()), + ], ), ] diff --git a/swh/web/auth/migrations/0003_delete_oidcuser.py b/swh/web/auth/migrations/0003_delete_oidcuser.py index 458c840f..2fe1de3c 100644 --- a/swh/web/auth/migrations/0003_delete_oidcuser.py +++ b/swh/web/auth/migrations/0003_delete_oidcuser.py @@ -1,14 +1,16 @@ # Generated by Django 2.2.19 on 2021-03-22 15:41 from django.db import migrations class Migration(migrations.Migration): dependencies = [ ("swh_web_auth", "0002_remove_stored_tokens"), ] operations = [ - migrations.DeleteModel(name="OIDCUser",), + migrations.DeleteModel( + name="OIDCUser", + ), ] diff --git a/swh/web/auth/migrations/0004_usermailmap.py b/swh/web/auth/migrations/0004_usermailmap.py index 86f25499..65333491 100644 --- a/swh/web/auth/migrations/0004_usermailmap.py +++ b/swh/web/auth/migrations/0004_usermailmap.py @@ -1,45 +1,47 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ ("swh_web_auth", "0003_delete_oidcuser"), ] operations = [ migrations.CreateModel( name="UserMailmap", fields=[ ( "id", models.AutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ("user_id", models.CharField(max_length=50, null=True)), ("from_email", models.TextField(unique=True, null=False)), ("from_email_verified", models.BooleanField(default=False)), ( "from_email_verification_request_date", models.DateTimeField(null=True), ), ("display_name", models.TextField(null=False)), ("display_name_activated", models.BooleanField(default=False)), ("to_email", models.TextField(null=True)), ("to_email_verified", models.BooleanField(default=False)), ("to_email_verification_request_date", models.DateTimeField(null=True)), ("mailmap_last_processing_date", models.DateTimeField(null=True)), ("last_update_date", models.DateTimeField(auto_now=True)), ], - options={"db_table": "user_mailmap",}, + options={ + "db_table": "user_mailmap", + }, ), ] diff --git a/swh/web/auth/migrations/0005_usermailmapevent.py b/swh/web/auth/migrations/0005_usermailmapevent.py index d256eff8..052cd8c2 100644 --- a/swh/web/auth/migrations/0005_usermailmapevent.py +++ b/swh/web/auth/migrations/0005_usermailmapevent.py @@ -1,39 +1,41 @@ # Generated by Django 2.2.27 on 2022-02-07 16:02 from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ ("swh_web_auth", "0004_usermailmap"), ] operations = [ migrations.CreateModel( name="UserMailmapEvent", fields=[ ( "id", models.AutoField( auto_created=True, primary_key=True, serialize=False, 
verbose_name="ID", ), ), ("timestamp", models.DateTimeField(auto_now=True)), ("user_id", models.CharField(max_length=50)), ("request_type", models.CharField(max_length=50)), ("request", models.TextField()), ("successful", models.BooleanField(default=False)), ], - options={"db_table": "user_mailmap_event",}, + options={ + "db_table": "user_mailmap_event", + }, ), migrations.AddIndex( model_name="usermailmapevent", index=models.Index( fields=["timestamp"], name="user_mailma_timesta_1f7aef_idx" ), ), ] diff --git a/swh/web/auth/views.py b/swh/web/auth/views.py index 28dc14e7..2cf06021 100644 --- a/swh/web/auth/views.py +++ b/swh/web/auth/views.py @@ -1,193 +1,197 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json from typing import Any, Dict, Union, cast from cryptography.fernet import InvalidToken from django.conf.urls import url from django.contrib.auth.decorators import login_required from django.core.paginator import Paginator from django.http import HttpRequest from django.http.response import ( HttpResponse, HttpResponseBadRequest, HttpResponseForbidden, HttpResponseRedirect, JsonResponse, ) from django.shortcuts import render from django.views.decorators.http import require_http_methods from swh.auth.django.models import OIDCUser from swh.auth.django.utils import keycloak_oidc_client from swh.auth.django.views import get_oidc_login_data, oidc_login_view from swh.auth.django.views import urlpatterns as auth_urlpatterns from swh.auth.keycloak import KeycloakError, keycloak_error_message from swh.web.auth.models import OIDCUserOfflineTokens from swh.web.auth.utils import decrypt_data, encrypt_data from swh.web.common.exc import ForbiddenExc from swh.web.common.utils import reverse from swh.web.config import get_config from .mailmap import urlpatterns as mailmap_urlpatterns def oidc_generate_bearer_token(request: HttpRequest) -> HttpResponse: if not request.user.is_authenticated or not isinstance(request.user, OIDCUser): return HttpResponseForbidden() redirect_uri = reverse("oidc-generate-bearer-token-complete", request=request) return oidc_login_view( request, redirect_uri=redirect_uri, scope="openid offline_access" ) def oidc_generate_bearer_token_complete(request: HttpRequest) -> HttpResponse: if not request.user.is_authenticated or not isinstance(request.user, OIDCUser): raise ForbiddenExc("You are not allowed to generate bearer tokens.") if "error" in request.GET: raise Exception(request.GET["error"]) login_data = get_oidc_login_data(request) oidc_client = keycloak_oidc_client() oidc_profile = oidc_client.authorization_code( code=request.GET["code"], code_verifier=login_data["code_verifier"], redirect_uri=login_data["redirect_uri"], ) user = cast(OIDCUser, request.user) token = oidc_profile["refresh_token"] secret = get_config()["secret_key"].encode() salt = user.sub.encode() encrypted_token = encrypt_data(token.encode(), secret, salt) OIDCUserOfflineTokens.objects.create( user_id=str(user.id), offline_token=encrypted_token ).save() return HttpResponseRedirect(reverse("oidc-profile") + "#tokens") def oidc_list_bearer_tokens(request: HttpRequest) -> HttpResponse: if not request.user.is_authenticated or not isinstance(request.user, OIDCUser): return HttpResponseForbidden() tokens = OIDCUserOfflineTokens.objects.filter(user_id=str(request.user.id)) tokens = 
tokens.order_by("-creation_date") length = int(request.GET["length"]) page = int(request.GET["start"]) / length + 1 paginator = Paginator(tokens, length) tokens_data = [ {"id": t.id, "creation_date": t.creation_date.isoformat()} for t in paginator.page(int(page)).object_list ] table_data: Dict[str, Any] = {} table_data["recordsTotal"] = len(tokens_data) table_data["draw"] = int(request.GET["draw"]) table_data["data"] = tokens_data table_data["recordsFiltered"] = len(tokens_data) return JsonResponse(table_data) def _encrypted_token_bytes(token: Union[bytes, memoryview]) -> bytes: # token has been retrieved from a PostgreSQL database if isinstance(token, memoryview): return token.tobytes() else: return token @require_http_methods(["POST"]) def oidc_get_bearer_token(request: HttpRequest) -> HttpResponse: if not request.user.is_authenticated or not isinstance(request.user, OIDCUser): return HttpResponseForbidden() try: data = json.loads(request.body.decode("ascii")) user = cast(OIDCUser, request.user) token_data = OIDCUserOfflineTokens.objects.get(id=data["token_id"]) secret = get_config()["secret_key"].encode() salt = user.sub.encode() decrypted_token = decrypt_data( _encrypted_token_bytes(token_data.offline_token), secret, salt ) refresh_token = decrypted_token.decode("ascii") # check token is still valid oidc_client = keycloak_oidc_client() oidc_client.refresh_token(refresh_token) return HttpResponse(refresh_token, content_type="text/plain") except InvalidToken: return HttpResponse(status=401) except KeycloakError as ke: error_msg = keycloak_error_message(ke) if error_msg in ( "invalid_grant: Offline session not active", "invalid_grant: Offline user session not found", ): error_msg = "Bearer token has expired, please generate a new one." return HttpResponseBadRequest(error_msg, content_type="text/plain") @require_http_methods(["POST"]) def oidc_revoke_bearer_tokens(request: HttpRequest) -> HttpResponse: if not request.user.is_authenticated or not isinstance(request.user, OIDCUser): return HttpResponseForbidden() try: data = json.loads(request.body.decode("ascii")) user = cast(OIDCUser, request.user) for token_id in data["token_ids"]: token_data = OIDCUserOfflineTokens.objects.get(id=token_id) secret = get_config()["secret_key"].encode() salt = user.sub.encode() decrypted_token = decrypt_data( _encrypted_token_bytes(token_data.offline_token), secret, salt ) oidc_client = keycloak_oidc_client() oidc_client.logout(decrypted_token.decode("ascii")) token_data.delete() return HttpResponse(status=200) except InvalidToken: return HttpResponse(status=401) @login_required(login_url="/oidc/login/", redirect_field_name="next_path") def _oidc_profile_view(request: HttpRequest) -> HttpResponse: return render(request, "auth/profile.html") urlpatterns = ( auth_urlpatterns + [ url( r"^oidc/generate-bearer-token/$", oidc_generate_bearer_token, name="oidc-generate-bearer-token", ), url( r"^oidc/generate-bearer-token-complete/$", oidc_generate_bearer_token_complete, name="oidc-generate-bearer-token-complete", ), url( r"^oidc/list-bearer-token/$", oidc_list_bearer_tokens, name="oidc-list-bearer-tokens", ), url( r"^oidc/get-bearer-token/$", oidc_get_bearer_token, name="oidc-get-bearer-token", ), url( r"^oidc/revoke-bearer-tokens/$", oidc_revoke_bearer_tokens, name="oidc-revoke-bearer-tokens", ), - url(r"^oidc/profile/$", _oidc_profile_view, name="oidc-profile",), + url( + r"^oidc/profile/$", + _oidc_profile_view, + name="oidc-profile", + ), ] + mailmap_urlpatterns ) diff --git a/swh/web/browse/snapshot_context.py
b/swh/web/browse/snapshot_context.py index c21531df..9bc040b6 100644 --- a/swh/web/browse/snapshot_context.py +++ b/swh/web/browse/snapshot_context.py @@ -1,1322 +1,1330 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information # Utility module for browsing the archive in a snapshot context. from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple from django.shortcuts import render from django.utils.html import escape from swh.model.hashutil import hash_to_bytes from swh.model.model import Snapshot from swh.model.swhids import CoreSWHID, ObjectType from swh.web.browse.utils import ( format_log_entries, gen_release_link, gen_revision_link, gen_revision_log_link, gen_revision_url, gen_snapshot_link, get_directory_entries, get_readme_to_display, ) from swh.web.common import archive from swh.web.common.exc import BadInputExc, NotFoundExc, http_status_code_message from swh.web.common.identifiers import get_swhids_info from swh.web.common.origin_visits import get_origin_visit from swh.web.common.typing import ( DirectoryMetadata, OriginInfo, SnapshotBranchInfo, SnapshotContext, SnapshotReleaseInfo, SWHObjectInfo, ) from swh.web.common.utils import ( django_cache, format_utc_iso_date, gen_path_info, reverse, swh_object_icons, ) from swh.web.config import get_config _empty_snapshot_id = Snapshot(branches={}).id.hex() def _get_branch(branches, branch_name, snapshot_id): """ Utility function to get a specific branch from a snapshot. Returns None if the branch cannot be found. """ filtered_branches = [b for b in branches if b["name"] == branch_name] if filtered_branches: return filtered_branches[0] else: # case where a large branches list has been truncated snp = archive.lookup_snapshot( snapshot_id, branches_from=branch_name, branches_count=1, target_types=["revision", "alias"], # pull request branches must be browsable even if they are hidden # by default in branches list branch_name_exclude_prefix=None, ) snp_branch, _, _ = process_snapshot_branches(snp) if snp_branch and snp_branch[0]["name"] == branch_name: branches.append(snp_branch[0]) return snp_branch[0] def _get_release(releases, release_name, snapshot_id): """ Utility function to get a specific release from a snapshot. Returns None if the release cannot be found. """ filtered_releases = [r for r in releases if r["name"] == release_name] if filtered_releases: return filtered_releases[0] else: # case where a large branches list has been truncated try: # git origins have specific branches for releases snp = archive.lookup_snapshot( snapshot_id, branches_from=f"refs/tags/{release_name}", branches_count=1, target_types=["release"], ) except NotFoundExc: snp = archive.lookup_snapshot( snapshot_id, branches_from=release_name, branches_count=1, target_types=["release", "alias"], ) _, snp_release, _ = process_snapshot_branches(snp) if snp_release and snp_release[0]["name"] == release_name: releases.append(snp_release[0]) return snp_release[0] def _branch_not_found( branch_type, branch, snapshot_id, snapshot_sizes, origin_info, timestamp, visit_id ): """ Utility function to raise an exception when a specified branch/release can not be found. 
""" if branch_type == "branch": branch_type = "Branch" branch_type_plural = "branches" target_type = "revision" else: branch_type = "Release" branch_type_plural = "releases" target_type = "release" if snapshot_id and snapshot_sizes[target_type] == 0: msg = "Snapshot with id %s has an empty list" " of %s!" % ( snapshot_id, branch_type_plural, ) elif snapshot_id: msg = "%s %s for snapshot with id %s" " not found!" % ( branch_type, branch, snapshot_id, ) elif visit_id and snapshot_sizes[target_type] == 0: msg = ( "Origin with url %s" " for visit with id %s has an empty list" " of %s!" % (origin_info["url"], visit_id, branch_type_plural) ) elif visit_id: msg = ( "%s %s associated to visit with" " id %s for origin with url %s" " not found!" % (branch_type, branch, visit_id, origin_info["url"]) ) elif snapshot_sizes[target_type] == 0: msg = ( "Origin with url %s" " for visit with timestamp %s has an empty list" " of %s!" % (origin_info["url"], timestamp, branch_type_plural) ) else: msg = ( "%s %s associated to visit with" " timestamp %s for origin with " "url %s not found!" % (branch_type, branch, timestamp, origin_info["url"]) ) raise NotFoundExc(escape(msg)) def process_snapshot_branches( snapshot: Dict[str, Any] ) -> Tuple[List[SnapshotBranchInfo], List[SnapshotReleaseInfo], Dict[str, Any]]: """ Process a dictionary describing snapshot branches: extract those targeting revisions and releases, put them in two different lists, then sort those lists in lexicographical order of the branches' names. Args: snapshot: A dict describing a snapshot as returned for instance by :func:`swh.web.common.archive.lookup_snapshot` Returns: A tuple whose first member is the sorted list of branches targeting revisions, second member the sorted list of branches targeting releases and third member a dict mapping resolved branch aliases to their real target. 
""" snapshot_branches = snapshot["branches"] branches: Dict[str, SnapshotBranchInfo] = {} branch_aliases: Dict[str, str] = {} releases: Dict[str, SnapshotReleaseInfo] = {} revision_to_branch = defaultdict(set) revision_to_release = defaultdict(set) release_to_branch = defaultdict(set) for branch_name, target in snapshot_branches.items(): if not target: # FIXME: display branches with an unknown target anyway continue target_id = target["target"] target_type = target["target_type"] if target_type == "revision": branches[branch_name] = SnapshotBranchInfo( name=branch_name, alias=False, revision=target_id, date=None, directory=None, message=None, url=None, ) revision_to_branch[target_id].add(branch_name) elif target_type == "release": release_to_branch[target_id].add(branch_name) elif target_type == "alias": branch_aliases[branch_name] = target_id # FIXME: handle pointers to other object types def _add_release_info(branch, release, alias=False): releases[branch] = SnapshotReleaseInfo( name=release["name"], alias=alias, branch_name=branch, date=format_utc_iso_date(release["date"]), directory=None, id=release["id"], message=release["message"], target_type=release["target_type"], target=release["target"], url=None, ) def _add_branch_info(branch, revision, alias=False): branches[branch] = SnapshotBranchInfo( name=branch, alias=alias, revision=revision["id"], directory=revision["directory"], date=format_utc_iso_date(revision["date"]), message=revision["message"], url=None, ) releases_info = archive.lookup_release_multiple(release_to_branch.keys()) for release in releases_info: if release is None: continue branches_to_update = release_to_branch[release["id"]] for branch in branches_to_update: _add_release_info(branch, release) if release["target_type"] == "revision": revision_to_release[release["target"]].update(branches_to_update) revisions = archive.lookup_revision_multiple( set(revision_to_branch.keys()) | set(revision_to_release.keys()) ) for revision in revisions: if not revision: continue for branch in revision_to_branch[revision["id"]]: _add_branch_info(branch, revision) for release_id in revision_to_release[revision["id"]]: releases[release_id]["directory"] = revision["directory"] resolved_aliases = {} for branch_alias, branch_target in branch_aliases.items(): resolved_alias = archive.lookup_snapshot_alias(snapshot["id"], branch_alias) resolved_aliases[branch_alias] = resolved_alias if resolved_alias is None: continue target_type = resolved_alias["target_type"] target = resolved_alias["target"] if target_type == "revision": revision = archive.lookup_revision(target) _add_branch_info(branch_alias, revision, alias=True) elif target_type == "release": release = archive.lookup_release(target) _add_release_info(branch_alias, release, alias=True) if branch_alias in branches: branches[branch_alias]["name"] = branch_alias ret_branches = list(sorted(branches.values(), key=lambda b: b["name"])) ret_releases = list(sorted(releases.values(), key=lambda b: b["name"])) return ret_branches, ret_releases, resolved_aliases @django_cache() def get_snapshot_content( snapshot_id: str, ) -> Tuple[List[SnapshotBranchInfo], List[SnapshotReleaseInfo], Dict[str, Any]]: """Returns the lists of branches and releases associated to a swh snapshot. That list is put in cache in order to speedup the navigation in the swh-web/browse ui. .. warning:: At most 1000 branches contained in the snapshot will be returned for performance reasons. 
Args: snapshot_id: hexadecimal representation of the snapshot identifier Returns: A tuple with three members. The first one is a list of dict describing the snapshot branches. The second one is a list of dict describing the snapshot releases. The third one is a dict mapping resolved branch aliases to their real target. Raises: NotFoundExc if the snapshot does not exist """ branches: List[SnapshotBranchInfo] = [] releases: List[SnapshotReleaseInfo] = [] aliases: Dict[str, Any] = {} snapshot_content_max_size = get_config()["snapshot_content_max_size"] if snapshot_id: snapshot = archive.lookup_snapshot( snapshot_id, branches_count=snapshot_content_max_size ) branches, releases, aliases = process_snapshot_branches(snapshot) return branches, releases, aliases def get_origin_visit_snapshot( origin_info: OriginInfo, visit_ts: Optional[str] = None, visit_id: Optional[int] = None, snapshot_id: Optional[str] = None, ) -> Tuple[List[SnapshotBranchInfo], List[SnapshotReleaseInfo], Dict[str, Any]]: """Returns the lists of branches and releases associated to an origin for a given visit. The visit is expressed by either: * a snapshot identifier * a timestamp, if no visit with that exact timestamp is found, the closest one from the provided timestamp will be used. If no visit parameter is provided, it returns the list of branches found for the latest visit. That list is put in cache in order to speedup the navigation in the swh-web/browse ui. .. warning:: At most 1000 branches contained in the snapshot will be returned for performance reasons. Args: origin_info: a dict filled with origin information visit_ts: an ISO 8601 datetime string to parse visit_id: visit id for disambiguation in case several visits have the same timestamp snapshot_id: if provided, visit associated to the snapshot will be processed Returns: A tuple with three members. The first one is a list of dict describing the origin branches for the given visit. The second one is a list of dict describing the origin releases for the given visit. The third one is a dict mapping resolved branch aliases to their real target. Raises: NotFoundExc if the origin or its visit are not found """ visit_info = get_origin_visit(origin_info, visit_ts, visit_id, snapshot_id) return get_snapshot_content(visit_info["snapshot"]) def get_snapshot_context( snapshot_id: Optional[str] = None, origin_url: Optional[str] = None, timestamp: Optional[str] = None, visit_id: Optional[int] = None, branch_name: Optional[str] = None, release_name: Optional[str] = None, revision_id: Optional[str] = None, path: Optional[str] = None, browse_context: str = "directory", ) -> SnapshotContext: """ Utility function to compute relevant information when navigating the archive in a snapshot context. The snapshot is either referenced by its id or it will be retrieved from an origin visit. 
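Both entry points can be sketched as follows; the origin URL, timestamp and snapshot id are placeholders:

    from swh.web.browse.snapshot_context import get_snapshot_context

    # from an origin visit: resolve the visit closest to the timestamp,
    # then load the associated snapshot
    ctx = get_snapshot_context(
        origin_url="https://git.example.org/repo",  # hypothetical origin
        timestamp="2022-03-01T00:00:00Z",
    )
    # or directly from a snapshot identifier
    ctx = get_snapshot_context(snapshot_id="0" * 40)  # placeholder hex id
    print(ctx["root_directory"], ctx["revision_id"], ctx["branches_url"])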
Args: snapshot_id: hexadecimal representation of a snapshot identifier origin_url: the origin URL timestamp: a datetime string for retrieving the closest visit of the origin visit_id: optional visit id for disambiguation in case of several visits with the same timestamp branch_name: optional branch name set when browsing the snapshot in that scope (will default to "HEAD" if not provided) release_name: optional release name set when browsing the snapshot in that scope revision_id: optional revision identifier set when browsing the snapshot in that scope path: optional path of the object currently browsed in the snapshot browse_context: indicates which type of object is currently browsed Returns: A dict filled with snapshot context information. Raises: swh.web.common.exc.NotFoundExc: if no snapshot is found for the visit of an origin. """ assert origin_url is not None or snapshot_id is not None origin_info = None visit_info = None url_args = {} query_params: Dict[str, Any] = {} origin_visits_url = None if origin_url: if visit_id is not None: query_params["visit_id"] = visit_id elif snapshot_id is not None: query_params["snapshot"] = snapshot_id origin_info = archive.lookup_origin({"url": origin_url}) visit_info = get_origin_visit(origin_info, timestamp, visit_id, snapshot_id) formatted_date = format_utc_iso_date(visit_info["date"]) visit_info["formatted_date"] = formatted_date snapshot_id = visit_info["snapshot"] if not snapshot_id: raise NotFoundExc( "No snapshot associated to the visit of origin " "%s on %s" % (escape(origin_url), formatted_date) ) # provided timestamp is not necessarily equal to the one # of the retrieved visit, so get the exact one in order # to use it in the urls generated below if timestamp: timestamp = visit_info["date"] branches, releases, aliases = get_origin_visit_snapshot( origin_info, timestamp, visit_id, snapshot_id ) query_params["origin_url"] = origin_info["url"] origin_visits_url = reverse( "browse-origin-visits", query_params={"origin_url": origin_info["url"]} ) if timestamp is not None: query_params["timestamp"] = format_utc_iso_date( timestamp, "%Y-%m-%dT%H:%M:%SZ" ) visit_url = reverse("browse-origin-directory", query_params=query_params) visit_info["url"] = directory_url = visit_url branches_url = reverse("browse-origin-branches", query_params=query_params) releases_url = reverse("browse-origin-releases", query_params=query_params) else: assert snapshot_id is not None branches, releases, aliases = get_snapshot_content(snapshot_id) url_args = {"snapshot_id": snapshot_id} directory_url = reverse("browse-snapshot-directory", url_args=url_args) branches_url = reverse("browse-snapshot-branches", url_args=url_args) releases_url = reverse("browse-snapshot-releases", url_args=url_args) releases = list(reversed(releases)) @django_cache() def _get_snapshot_sizes(snapshot_id): return archive.lookup_snapshot_sizes(snapshot_id) snapshot_sizes = _get_snapshot_sizes(snapshot_id) is_empty = (snapshot_sizes["release"] + snapshot_sizes["revision"]) == 0 swh_snp_id = str( CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=hash_to_bytes(snapshot_id)) ) if visit_info: timestamp = format_utc_iso_date(visit_info["date"]) if origin_info: browse_view_name = f"browse-origin-{browse_context}" else: browse_view_name = f"browse-snapshot-{browse_context}" release_id = None root_directory = None snapshot_total_size = snapshot_sizes["release"] + snapshot_sizes["revision"] if path is not None: query_params["path"] = path if snapshot_total_size and revision_id is not None: # browse 
     if snapshot_total_size and revision_id is not None:
         # browse specific revision for a snapshot requested
         revision = archive.lookup_revision(revision_id)
         root_directory = revision["directory"]
         branches.append(
             SnapshotBranchInfo(
                 name=revision_id,
                 alias=False,
                 revision=revision_id,
                 directory=root_directory,
                 date=revision["date"],
                 message=revision["message"],
                 url=None,
             )
         )
         query_params["revision"] = revision_id
     elif snapshot_total_size and release_name:
         # browse specific release for a snapshot requested
         release = _get_release(releases, release_name, snapshot_id)
         if release is None:
             _branch_not_found(
                 "release",
                 release_name,
                 snapshot_id,
                 snapshot_sizes,
                 origin_info,
                 timestamp,
                 visit_id,
             )
         else:
             if release["target_type"] == "revision":
                 revision = archive.lookup_revision(release["target"])
                 root_directory = revision["directory"]
                 revision_id = release["target"]
             elif release["target_type"] == "directory":
                 root_directory = release["target"]
             release_id = release["id"]
             query_params["release"] = release_name
     elif snapshot_total_size:
         head = aliases.get("HEAD")
         if branch_name:
             # browse specific branch for a snapshot requested
             query_params["branch"] = branch_name
             branch = _get_branch(branches, branch_name, snapshot_id)
             if branch is None:
                 _branch_not_found(
                     "branch",
                     branch_name,
                     snapshot_id,
                     snapshot_sizes,
                     origin_info,
                     timestamp,
                     visit_id,
                 )
             else:
                 branch_name = branch["name"]
                 revision_id = branch["revision"]
                 root_directory = branch["directory"]
         elif head is not None:
             # otherwise, browse branch targeted by the HEAD alias if it exists
             if head["target_type"] == "revision":
                 # HEAD alias targets a revision
                 head_rev = archive.lookup_revision(head["target"])
                 branch_name = "HEAD"
                 revision_id = head_rev["id"]
                 root_directory = head_rev["directory"]
             else:
                 # HEAD alias targets a release
                 release_name = archive.lookup_release(head["target"])["name"]
                 head_rel = _get_release(releases, release_name, snapshot_id)
                 if head_rel["target_type"] == "revision":
                     revision = archive.lookup_revision(head_rel["target"])
                     root_directory = revision["directory"]
                     revision_id = head_rel["target"]
                 elif head_rel["target_type"] == "directory":
                     root_directory = head_rel["target"]
                 release_id = head_rel["id"]
         elif branches:
             # fallback to browse first branch otherwise
             branch = branches[0]
             branch_name = branch["name"]
             revision_id = branch["revision"]
             root_directory = branch["directory"]
         elif releases:
             # fallback to browse last release otherwise
             release = releases[-1]
             if release["target_type"] == "revision":
                 revision = archive.lookup_revision(release["target"])
                 root_directory = revision["directory"]
                 revision_id = release["target"]
             elif release["target_type"] == "directory":
                 root_directory = release["target"]
             release_id = release["id"]
             release_name = release["name"]

     for b in branches:
         branch_query_params = dict(query_params)
         branch_query_params.pop("release", None)
         if b["name"] != b["revision"]:
             branch_query_params.pop("revision", None)
             branch_query_params["branch"] = b["name"]
         b["url"] = reverse(
             browse_view_name, url_args=url_args, query_params=branch_query_params
         )

     for r in releases:
         release_query_params = dict(query_params)
         release_query_params.pop("branch", None)
         release_query_params.pop("revision", None)
         release_query_params["release"] = r["name"]
         r["url"] = reverse(
-            browse_view_name, url_args=url_args, query_params=release_query_params,
+            browse_view_name,
+            url_args=url_args,
+            query_params=release_query_params,
         )
     revision_info = None
     if revision_id:
         try:
             revision_info = archive.lookup_revision(revision_id)
         except NotFoundExc:
             pass
         else:
             revision_info["date"] = format_utc_iso_date(revision_info["date"])
             revision_info["committer_date"] = format_utc_iso_date(
                 revision_info["committer_date"]
             )
             if revision_info["message"]:
                 message_lines = revision_info["message"].split("\n")
                 revision_info["message_header"] = message_lines[0]
             else:
                 revision_info["message_header"] = ""

     snapshot_context = SnapshotContext(
         directory_url=directory_url,
         branch=branch_name,
         branch_alias=branch_name in aliases,
         branches=branches,
         branches_url=branches_url,
         is_empty=is_empty,
         origin_info=origin_info,
         origin_visits_url=origin_visits_url,
         release=release_name,
         release_alias=release_name in aliases,
         release_id=release_id,
         query_params=query_params,
         releases=releases,
         releases_url=releases_url,
         revision_id=revision_id,
         revision_info=revision_info,
         root_directory=root_directory,
         snapshot_id=snapshot_id,
         snapshot_sizes=snapshot_sizes,
         snapshot_swhid=swh_snp_id,
         url_args=url_args,
         visit_info=visit_info,
     )

     if revision_info:
         revision_info["revision_url"] = gen_revision_url(revision_id, snapshot_context)

     return snapshot_context


 def _build_breadcrumbs(snapshot_context: SnapshotContext, path: str):
     origin_info = snapshot_context["origin_info"]
     url_args = snapshot_context["url_args"]
     query_params = dict(snapshot_context["query_params"])
     root_directory = snapshot_context["root_directory"]

     path_info = gen_path_info(path)

     if origin_info:
         browse_view_name = "browse-origin-directory"
     else:
         browse_view_name = "browse-snapshot-directory"

     breadcrumbs = []
     if root_directory:
         query_params.pop("path", None)
         breadcrumbs.append(
             {
                 "name": root_directory[:7],
                 "url": reverse(
                     browse_view_name, url_args=url_args, query_params=query_params
                 ),
             }
         )
     for pi in path_info:
         query_params["path"] = pi["path"]
         breadcrumbs.append(
             {
                 "name": pi["name"],
                 "url": reverse(
                     browse_view_name, url_args=url_args, query_params=query_params
                 ),
             }
         )
     return breadcrumbs
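# Editor's note: a self-contained sketch of the cumulative path splitting that
# _build_breadcrumbs relies on; it assumes gen_path_info returns one
# {"name", "path"} pair per path component, as its use above suggests.
def path_breadcrumbs(path: str):
    crumbs, acc = [], []
    for name in path.strip("/").split("/"):
        acc.append(name)
        crumbs.append({"name": name, "path": "/".join(acc)})
    return crumbs

# path_breadcrumbs("src/main.c") == [
#     {"name": "src", "path": "src"},
#     {"name": "main.c", "path": "src/main.c"},
# ]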
""" _check_origin_url(snapshot_id, origin_url) visit_id = int(request.GET.get("visit_id", 0)) snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=timestamp, visit_id=visit_id or None, path=path, browse_context="directory", branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), revision_id=request.GET.get("revision"), ) root_directory = snapshot_context["root_directory"] sha1_git = root_directory error_info = { "status_code": 200, "description": None, } if root_directory and path: try: dir_info = archive.lookup_directory_with_path(root_directory, path) sha1_git = dir_info["target"] except NotFoundExc as e: sha1_git = None error_info["status_code"] = 404 error_info["description"] = f"NotFoundExc: {str(e)}" dirs = [] files = [] if sha1_git: dirs, files = get_directory_entries(sha1_git) origin_info = snapshot_context["origin_info"] visit_info = snapshot_context["visit_info"] url_args = snapshot_context["url_args"] query_params = dict(snapshot_context["query_params"]) revision_id = snapshot_context["revision_id"] snapshot_id = snapshot_context["snapshot_id"] if origin_info: browse_view_name = "browse-origin-directory" else: browse_view_name = "browse-snapshot-directory" breadcrumbs = _build_breadcrumbs(snapshot_context, path) path = "" if path is None else (path + "/") for d in dirs: if d["type"] == "rev": d["url"] = reverse("browse-revision", url_args={"sha1_git": d["target"]}) else: query_params["path"] = path + d["name"] d["url"] = reverse( browse_view_name, url_args=url_args, query_params=query_params ) sum_file_sizes = 0 readmes = {} if origin_info: browse_view_name = "browse-origin-content" else: browse_view_name = "browse-snapshot-content" for f in files: query_params["path"] = path + f["name"] f["url"] = reverse( browse_view_name, url_args=url_args, query_params=query_params ) if f["length"] is not None: sum_file_sizes += f["length"] if f["name"].lower().startswith("readme"): readmes[f["name"]] = f["checksums"]["sha1"] readme_name, readme_url, readme_html = get_readme_to_display(readmes) if origin_info: browse_view_name = "browse-origin-log" else: browse_view_name = "browse-snapshot-log" history_url = None if snapshot_id != _empty_snapshot_id: query_params.pop("path", None) history_url = reverse( browse_view_name, url_args=url_args, query_params=query_params ) nb_files = None nb_dirs = None dir_path = None if root_directory: nb_files = len(files) nb_dirs = len(dirs) dir_path = "/" + path swh_objects = [] vault_cooking = {} revision_found = False if revision_id is not None: try: archive.lookup_revision(revision_id) except NotFoundExc: pass else: revision_found = True if sha1_git is not None: swh_objects.append( SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=sha1_git) ) vault_cooking.update( - {"directory_context": True, "directory_swhid": f"swh:1:dir:{sha1_git}",} + { + "directory_context": True, + "directory_swhid": f"swh:1:dir:{sha1_git}", + } ) if revision_id is not None and revision_found: swh_objects.append( SWHObjectInfo(object_type=ObjectType.REVISION, object_id=revision_id) ) vault_cooking.update( - {"revision_context": True, "revision_swhid": f"swh:1:rev:{revision_id}",} + { + "revision_context": True, + "revision_swhid": f"swh:1:rev:{revision_id}", + } ) swh_objects.append( SWHObjectInfo(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id) ) visit_date = None visit_type = None if visit_info: visit_date = format_utc_iso_date(visit_info["date"]) visit_type = visit_info["type"] release_id = 
snapshot_context["release_id"] if release_id: swh_objects.append( SWHObjectInfo(object_type=ObjectType.RELEASE, object_id=release_id) ) dir_metadata = DirectoryMetadata( object_type=ObjectType.DIRECTORY, object_id=sha1_git, directory=sha1_git, nb_files=nb_files, nb_dirs=nb_dirs, sum_file_sizes=sum_file_sizes, root_directory=root_directory, path=dir_path, revision=revision_id, revision_found=revision_found, release=release_id, snapshot=snapshot_id, origin_url=origin_url, visit_date=visit_date, visit_type=visit_type, ) swhids_info = get_swhids_info(swh_objects, snapshot_context, dir_metadata) dir_path = "/".join([bc["name"] for bc in breadcrumbs]) + "/" context_found = "snapshot: %s" % snapshot_context["snapshot_id"] if origin_info: context_found = "origin: %s" % origin_info["url"] heading = "Directory - %s - %s - %s" % ( dir_path, snapshot_context["branch"], context_found, ) top_right_link = None if not snapshot_context["is_empty"] and revision_found: top_right_link = { "url": history_url, "icon": swh_object_icons["revisions history"], "text": "History", } return render( request, "browse/directory.html", { "heading": heading, "swh_object_name": "Directory", "swh_object_metadata": dir_metadata, "dirs": dirs, "files": files, "breadcrumbs": breadcrumbs if root_directory else [], "top_right_link": top_right_link, "readme_name": readme_name, "readme_url": readme_url, "readme_html": readme_html, "snapshot_context": snapshot_context, "vault_cooking": vault_cooking, "show_actions": True, "swhids_info": swhids_info, "error_code": error_info["status_code"], "error_message": http_status_code_message.get(error_info["status_code"]), "error_description": error_info["description"], }, status=error_info["status_code"], ) PER_PAGE = 100 def browse_snapshot_log(request, snapshot_id=None, origin_url=None, timestamp=None): """ Django view implementation for browsing a revision history in a snapshot context. 
""" _check_origin_url(snapshot_id, origin_url) visit_id = int(request.GET.get("visit_id", 0)) snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=timestamp, visit_id=visit_id or None, browse_context="log", branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), revision_id=request.GET.get("revision"), ) revision_id = snapshot_context["revision_id"] if revision_id is None: raise NotFoundExc("No revisions history found in the current snapshot context.") per_page = int(request.GET.get("per_page", PER_PAGE)) offset = int(request.GET.get("offset", 0)) revs_ordering = request.GET.get("revs_ordering", "committer_date") session_key = "rev_%s_log_ordering_%s" % (revision_id, revs_ordering) rev_log_session = request.session.get(session_key, None) rev_log = [] revs_walker_state = None if rev_log_session: rev_log = rev_log_session["rev_log"] revs_walker_state = rev_log_session["revs_walker_state"] if len(rev_log) < offset + per_page: revs_walker = archive.get_revisions_walker( revs_ordering, revision_id, max_revs=offset + per_page + 1, state=revs_walker_state, ) rev_log += [rev["id"] for rev in revs_walker] revs_walker_state = revs_walker.export_state() revs = rev_log[offset : offset + per_page] revision_log = archive.lookup_revision_multiple(revs) request.session[session_key] = { "rev_log": rev_log, "revs_walker_state": revs_walker_state, } origin_info = snapshot_context["origin_info"] visit_info = snapshot_context["visit_info"] url_args = snapshot_context["url_args"] query_params = snapshot_context["query_params"] snapshot_id = snapshot_context["snapshot_id"] query_params["per_page"] = per_page revs_ordering = request.GET.get("revs_ordering", "") query_params["revs_ordering"] = revs_ordering or None if origin_info: browse_view_name = "browse-origin-log" else: browse_view_name = "browse-snapshot-log" prev_log_url = None if len(rev_log) > offset + per_page: query_params["offset"] = offset + per_page prev_log_url = reverse( browse_view_name, url_args=url_args, query_params=query_params ) next_log_url = None if offset != 0: query_params["offset"] = offset - per_page next_log_url = reverse( browse_view_name, url_args=url_args, query_params=query_params ) revision_log_data = format_log_entries(revision_log, per_page, snapshot_context) browse_rev_link = gen_revision_link(revision_id) browse_log_link = gen_revision_log_link(revision_id) browse_snp_link = gen_snapshot_link(snapshot_id) revision_metadata = { "context-independent revision": browse_rev_link, "context-independent revision history": browse_log_link, "context-independent snapshot": browse_snp_link, "snapshot": snapshot_id, } if origin_info: revision_metadata["origin url"] = origin_info["url"] revision_metadata["origin visit date"] = format_utc_iso_date(visit_info["date"]) revision_metadata["origin visit type"] = visit_info["type"] swh_objects = [ SWHObjectInfo(object_type=ObjectType.REVISION, object_id=revision_id), SWHObjectInfo(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), ] release_id = snapshot_context["release_id"] if release_id: swh_objects.append( SWHObjectInfo(object_type=ObjectType.RELEASE, object_id=release_id) ) browse_rel_link = gen_release_link(release_id) revision_metadata["release"] = release_id revision_metadata["context-independent release"] = browse_rel_link swhids_info = get_swhids_info(swh_objects, snapshot_context) context_found = "snapshot: %s" % snapshot_context["snapshot_id"] if origin_info: context_found = "origin: %s" % origin_info["url"] 
heading = "Revision history - %s - %s" % (snapshot_context["branch"], context_found) return render( request, "browse/revision-log.html", { "heading": heading, "swh_object_name": "Revisions history", "swh_object_metadata": revision_metadata, "revision_log": revision_log_data, "revs_ordering": revs_ordering, "next_log_url": next_log_url, "prev_log_url": prev_log_url, "breadcrumbs": None, "top_right_link": None, "snapshot_context": snapshot_context, "vault_cooking": None, "show_actions": True, "swhids_info": swhids_info, }, ) def browse_snapshot_branches( request, snapshot_id=None, origin_url=None, timestamp=None, branch_name_include=None ): """ Django view implementation for browsing a list of branches in a snapshot context. """ _check_origin_url(snapshot_id, origin_url) visit_id = int(request.GET.get("visit_id", 0)) snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=timestamp, visit_id=visit_id or None, ) branches_bc = request.GET.get("branches_breadcrumbs", "") branches_bc = branches_bc.split(",") if branches_bc else [] branches_from = branches_bc[-1] if branches_bc else "" origin_info = snapshot_context["origin_info"] url_args = snapshot_context["url_args"] query_params = snapshot_context["query_params"] if origin_info: browse_view_name = "browse-origin-directory" else: browse_view_name = "browse-snapshot-directory" snapshot = archive.lookup_snapshot( snapshot_context["snapshot_id"], branches_from, PER_PAGE + 1, target_types=["revision", "alias"], branch_name_include_substring=branch_name_include, ) displayed_branches = [] if snapshot: displayed_branches, _, _ = process_snapshot_branches(snapshot) for branch in displayed_branches: rev_query_params = {} if origin_info: rev_query_params["origin_url"] = origin_info["url"] revision_url = reverse( "browse-revision", url_args={"sha1_git": branch["revision"]}, query_params=query_params, ) query_params["branch"] = branch["name"] directory_url = reverse( browse_view_name, url_args=url_args, query_params=query_params ) del query_params["branch"] branch["revision_url"] = revision_url branch["directory_url"] = directory_url if origin_info: browse_view_name = "browse-origin-branches" else: browse_view_name = "browse-snapshot-branches" prev_branches_url = None next_branches_url = None if branches_bc: query_params_prev = dict(query_params) query_params_prev["branches_breadcrumbs"] = ",".join(branches_bc[:-1]) prev_branches_url = reverse( browse_view_name, url_args=url_args, query_params=query_params_prev ) elif branches_from: prev_branches_url = reverse( browse_view_name, url_args=url_args, query_params=query_params ) if snapshot and snapshot["next_branch"] is not None: query_params_next = dict(query_params) next_branch = displayed_branches[-1]["name"] del displayed_branches[-1] branches_bc.append(next_branch) query_params_next["branches_breadcrumbs"] = ",".join(branches_bc) next_branches_url = reverse( browse_view_name, url_args=url_args, query_params=query_params_next ) heading = "Branches - " if origin_info: heading += "origin: %s" % origin_info["url"] else: heading += "snapshot: %s" % snapshot_id return render( request, "browse/branches.html", { "heading": heading, "swh_object_name": "Branches", "swh_object_metadata": {}, "top_right_link": None, "displayed_branches": displayed_branches, "prev_branches_url": prev_branches_url, "next_branches_url": next_branches_url, "snapshot_context": snapshot_context, "search_string": branch_name_include or "", }, ) def browse_snapshot_releases( request, 
 def browse_snapshot_releases(
     request,
     snapshot_id=None,
     origin_url=None,
     timestamp=None,
     release_name_include=None,
 ):
     """
     Django view implementation for browsing a list of releases in a snapshot
     context.
     """
     _check_origin_url(snapshot_id, origin_url)

     visit_id = int(request.GET.get("visit_id", 0))
     snapshot_context = get_snapshot_context(
         snapshot_id=snapshot_id,
         origin_url=origin_url,
         timestamp=timestamp,
         visit_id=visit_id or None,
     )

     rel_bc = request.GET.get("releases_breadcrumbs", "")
     rel_bc = rel_bc.split(",") if rel_bc else []
     rel_from = rel_bc[-1] if rel_bc else ""

     origin_info = snapshot_context["origin_info"]
     url_args = snapshot_context["url_args"]
     query_params = snapshot_context["query_params"]

     snapshot = archive.lookup_snapshot(
         snapshot_context["snapshot_id"],
         rel_from,
         PER_PAGE + 1,
         target_types=["release", "alias"],
         branch_name_include_substring=release_name_include,
     )

     displayed_releases = []
     if snapshot:
         _, displayed_releases, _ = process_snapshot_branches(snapshot)

     for release in displayed_releases:
         query_params_tgt = {"snapshot": snapshot_id, "release": release["name"]}
         if origin_info:
             query_params_tgt["origin_url"] = origin_info["url"]

         release_url = reverse(
             "browse-release",
             url_args={"sha1_git": release["id"]},
             query_params=query_params_tgt,
         )

         target_url = ""
         tooltip = (
             f"The release {release['name']} targets "
             f"{release['target_type']} {release['target']}"
         )
         if release["target_type"] == "revision":
             target_url = reverse(
                 "browse-revision",
                 url_args={"sha1_git": release["target"]},
                 query_params=query_params_tgt,
             )
         elif release["target_type"] == "directory":
             target_url = reverse(
                 "browse-directory",
                 url_args={"sha1_git": release["target"]},
                 query_params=query_params_tgt,
             )
         elif release["target_type"] == "content":
             target_url = reverse(
                 "browse-content",
                 url_args={"query_string": release["target"]},
                 query_params=query_params_tgt,
             )
         elif release["target_type"] == "release":
             target_url = reverse(
                 "browse-release",
                 url_args={"sha1_git": release["target"]},
                 query_params=query_params_tgt,
             )
             tooltip = (
                 f"The release {release['name']} "
                 f"is an alias for release {release['target']}"
             )

         release["release_url"] = release_url
         release["target_url"] = target_url
         release["tooltip"] = tooltip

     if origin_info:
         browse_view_name = "browse-origin-releases"
     else:
         browse_view_name = "browse-snapshot-releases"

     prev_releases_url = None
     next_releases_url = None

     if rel_bc:
         query_params_prev = dict(query_params)
         query_params_prev["releases_breadcrumbs"] = ",".join(rel_bc[:-1])
         prev_releases_url = reverse(
             browse_view_name, url_args=url_args, query_params=query_params_prev
         )
     elif rel_from:
         prev_releases_url = reverse(
             browse_view_name, url_args=url_args, query_params=query_params
         )

     if snapshot and snapshot["next_branch"] is not None:
         query_params_next = dict(query_params)
         next_rel = displayed_releases[-1]["branch_name"]
         del displayed_releases[-1]
         rel_bc.append(next_rel)
         query_params_next["releases_breadcrumbs"] = ",".join(rel_bc)
         next_releases_url = reverse(
             browse_view_name, url_args=url_args, query_params=query_params_next
         )

     heading = "Releases - "
     if origin_info:
         heading += "origin: %s" % origin_info["url"]
     else:
         heading += "snapshot: %s" % snapshot_id
None, "show_actions": False, "search_string": release_name_include or "", }, ) diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py index a10cec56..3eaef3cd 100644 --- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -1,724 +1,726 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import stat import textwrap from typing import Tuple import chardet import magic import sentry_sdk from django.utils.html import escape from django.utils.safestring import mark_safe from swh.web.common import archive, highlightjs from swh.web.common.exc import NotFoundExc from swh.web.common.utils import ( browsers_supported_image_mimes, django_cache, format_utc_iso_date, reverse, rst_to_html, ) from swh.web.config import get_config @django_cache() def get_directory_entries(sha1_git): """Function that retrieves the content of a directory from the archive. The directories entries are first sorted in lexicographical order. Sub-directories and regular files are then extracted. Args: sha1_git: sha1_git identifier of the directory Returns: A tuple whose first member corresponds to the sub-directories list and second member the regular files list Raises: NotFoundExc if the directory is not found """ entries = list(archive.lookup_directory(sha1_git)) for e in entries: e["perms"] = stat.filemode(e["perms"]) if e["type"] == "rev": # modify dir entry name to explicitly show it points # to a revision e["name"] = "%s @ %s" % (e["name"], e["target"][:7]) dirs = [e for e in entries if e["type"] in ("dir", "rev")] files = [e for e in entries if e["type"] == "file"] dirs = sorted(dirs, key=lambda d: d["name"]) files = sorted(files, key=lambda f: f["name"]) return dirs, files def get_mimetype_and_encoding_for_content(content): """Function that returns the mime type and the encoding associated to a content buffer using the magic module under the hood. Args: content (bytes): a content buffer Returns: A tuple (mimetype, encoding), for instance ('text/plain', 'us-ascii'), associated to the provided content. """ m = magic.Magic(mime=True, mime_encoding=True) mime_encoding = m.from_buffer(content) mime_type, encoding = mime_encoding.split(";") encoding = encoding.replace(" charset=", "") return mime_type, encoding # maximum authorized content size in bytes for HTML display # with code highlighting content_display_max_size = get_config()["content_display_max_size"] def re_encode_content( mimetype: str, encoding: str, content_data: bytes ) -> Tuple[str, str, bytes]: """Try to re-encode textual content if it is not encoded to UTF-8 for proper display in the browse Web UI. 
 # maximum authorized content size in bytes for HTML display
 # with code highlighting
 content_display_max_size = get_config()["content_display_max_size"]


 def re_encode_content(
     mimetype: str, encoding: str, content_data: bytes
 ) -> Tuple[str, str, bytes]:
     """Try to re-encode textual content if it is not encoded to UTF-8
     for proper display in the browse Web UI.

     Args:
         mimetype: content mimetype as detected by python-magic
         encoding: content encoding as detected by python-magic
         content_data: raw content bytes

     Returns:
         A tuple with 3 members: content mimetype, content encoding (possibly
         updated after processing), content raw bytes (possibly reencoded to
         UTF-8)
     """
     if mimetype.startswith("text/") and encoding not in ("us-ascii", "utf-8"):
         # first check if chardet detects an encoding with confidence
         result = chardet.detect(content_data)
         if result["confidence"] >= 0.9:
             encoding = result["encoding"]
             content_data = content_data.decode(encoding).encode("utf-8")
         elif encoding == "unknown-8bit":
             # probably a malformed UTF-8 content, re-encode it
             # by replacing invalid chars with a substitution one
             content_data = content_data.decode("utf-8", "replace").encode("utf-8")
         elif encoding not in ["utf-8", "binary"]:
             content_data = content_data.decode(encoding, "replace").encode("utf-8")
     elif mimetype.startswith("application/octet-stream"):
         # file may detect a text content as binary
         # so try to decode it for display
         encodings = ["us-ascii", "utf-8"]
         encodings += ["iso-8859-%s" % i for i in range(1, 17)]
         for enc in encodings:
             try:
                 content_data = content_data.decode(enc).encode("utf-8")
             except Exception as exc:
                 sentry_sdk.capture_exception(exc)
             else:
                 # ensure display in content view
                 encoding = enc
                 mimetype = "text/plain"
                 break

     return mimetype, encoding, content_data
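# Editor's note: a standalone sketch of the chardet fallback implemented by
# re_encode_content above; the 0.9 confidence threshold mirrors that code.
import chardet

def best_effort_utf8(data: bytes) -> bytes:
    guess = chardet.detect(data)
    if guess["encoding"] and guess["confidence"] >= 0.9:
        return data.decode(guess["encoding"]).encode("utf-8")
    # otherwise replace undecodable bytes with the substitution character
    return data.decode("utf-8", "replace").encode("utf-8")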
 def request_content(
-    query_string, max_size=content_display_max_size, re_encode=True,
+    query_string,
+    max_size=content_display_max_size,
+    re_encode=True,
 ):
     """Function that retrieves a content from the archive.

     Raw bytes content is first retrieved, then the content mime type.
     If the mime type is not stored in the archive, it will be computed
     using Python magic module.

     Args:
         query_string: a string of the form "[ALGO_HASH:]HASH" where
             optional ALGO_HASH can be either ``sha1``, ``sha1_git``,
             ``sha256``, or ``blake2s256`` (default to ``sha1``) and HASH
             the hexadecimal representation of the hash value
         max_size: the maximum size for a content to retrieve (default to 1MB,
             no size limit if None)

     Returns:
         A tuple whose first member corresponds to the content raw bytes
         and second member the content mime type

     Raises:
         NotFoundExc if the content is not found
     """
     content_data = archive.lookup_content(query_string)
     filetype = None
     language = None
     # requests to the indexer db may fail so properly handle
     # those cases in order to avoid content display errors
     try:
         filetype = archive.lookup_content_filetype(query_string)
         language = archive.lookup_content_language(query_string)
     except Exception as exc:
         sentry_sdk.capture_exception(exc)
     mimetype = "unknown"
     encoding = "unknown"
     if filetype:
         mimetype = filetype["mimetype"]
         encoding = filetype["encoding"]

     if not max_size or content_data["length"] < max_size:
         try:
             content_raw = archive.lookup_content_raw(query_string)
         except Exception as exc:
             sentry_sdk.capture_exception(exc)
             raise NotFoundExc(
                 "The bytes of the content are currently not available "
                 "in the archive."
             )
         else:
             content_data["raw_data"] = content_raw["data"]

             if not filetype:
                 mimetype, encoding = get_mimetype_and_encoding_for_content(
                     content_data["raw_data"]
                 )

             if re_encode:
                 mimetype, encoding, raw_data = re_encode_content(
                     mimetype, encoding, content_data["raw_data"]
                 )
                 content_data["raw_data"] = raw_data
     else:
         content_data["raw_data"] = None

     content_data["mimetype"] = mimetype
     content_data["encoding"] = encoding

     if language:
         content_data["language"] = language["lang"]
     else:
         content_data["language"] = "not detected"

     return content_data


 def prepare_content_for_display(content_data, mime_type, path):
     """Function that prepares a content for HTML display.

     The function tries to associate a programming language to a
     content in order to perform syntax highlighting client-side
     using highlightjs. The language is determined using either
     the content filename or its mime type.
     If the mime type corresponds to an image format supported
     by web browsers, the content will be encoded in base64
     for displaying the image.

     Args:
         content_data (bytes): raw bytes of the content
         mime_type (string): mime type of the content
         path (string): path of the content including filename

     Returns:
         A dict containing the content bytes (possibly different from the one
         provided as parameter if it is an image) under the key 'content_data'
         and the corresponding highlightjs language class under the
         key 'language'.
     """
     language = None
     if path:
         language = highlightjs.get_hljs_language_from_filename(path.split("/")[-1])

     if language is None:
         language = highlightjs.get_hljs_language_from_mime_type(mime_type)

     if language is None:
         language = "plaintext"

     if mime_type.startswith("image/"):
         if mime_type in browsers_supported_image_mimes:
             content_data = base64.b64encode(content_data).decode("ascii")

     if mime_type.startswith("image/svg"):
         mime_type = "image/svg+xml"

     if mime_type.startswith("text/") or mime_type.startswith("application/"):
         content_data = content_data.decode("utf-8", errors="replace")

     return {"content_data": content_data, "language": language, "mimetype": mime_type}
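# Editor's note: illustrative only — the base64 step prepare_content_for_display
# applies to browser-supported images so they can be inlined in HTML.
import base64

def image_as_base64(content: bytes) -> str:
    return base64.b64encode(content).decode("ascii")

# A template could then render it as, e.g.,
# <img src="data:image/png;base64,..."> (assumption: this is how the browse
# templates consume the encoded bytes).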
 def gen_link(url, link_text=None, link_attrs=None):
     """
     Utility function for generating an HTML link to insert
     in Django templates.

     Args:
         url (str): an url
         link_text (str): optional text for the produced link, if not
             provided the url will be used
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         An HTML link in the form '<a href="url">link_text</a>'
     """
     attrs = " "
     if link_attrs:
         for k, v in link_attrs.items():
             attrs += '%s="%s" ' % (k, v)
     if not link_text:
         link_text = url
     link = '<a%shref="%s">%s</a>' % (attrs, escape(url), escape(link_text))
     return mark_safe(link)


 def _snapshot_context_query_params(snapshot_context):
     query_params = {}
     if not snapshot_context:
         return query_params
     if snapshot_context and snapshot_context["origin_info"]:
         origin_info = snapshot_context["origin_info"]
         snp_query_params = snapshot_context["query_params"]
         query_params = {"origin_url": origin_info["url"]}
         if "timestamp" in snp_query_params:
             query_params["timestamp"] = snp_query_params["timestamp"]
         if "visit_id" in snp_query_params:
             query_params["visit_id"] = snp_query_params["visit_id"]
         if "snapshot" in snp_query_params and "visit_id" not in query_params:
             query_params["snapshot"] = snp_query_params["snapshot"]
     elif snapshot_context:
         query_params = {"snapshot": snapshot_context["snapshot_id"]}
     if snapshot_context["release"]:
         query_params["release"] = snapshot_context["release"]
     elif snapshot_context["branch"] and snapshot_context["branch"] not in (
         "HEAD",
         snapshot_context["revision_id"],
     ):
         query_params["branch"] = snapshot_context["branch"]
     elif snapshot_context["revision_id"]:
         query_params["revision"] = snapshot_context["revision_id"]
     return query_params


 def gen_revision_url(revision_id, snapshot_context=None):
     """
     Utility function for generating an url to a revision.

     Args:
         revision_id (str): a revision id
         snapshot_context (dict): if provided, generate snapshot-dependent
             browsing url

     Returns:
         str: The url to browse the revision
     """
     query_params = _snapshot_context_query_params(snapshot_context)
     # remove query parameters not needed for a revision view
     query_params.pop("revision", None)
     query_params.pop("release", None)

     return reverse(
         "browse-revision", url_args={"sha1_git": revision_id}, query_params=query_params
     )


 def gen_revision_link(
     revision_id,
     shorten_id=False,
     snapshot_context=None,
     link_text="Browse",
     link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
 ):
     """
     Utility function for generating a link to a revision HTML view
     to insert in Django templates.

     Args:
         revision_id (str): a revision id
         shorten_id (boolean): whether to shorten the revision id to 7
             characters for the link text
         snapshot_context (dict): if provided, generate snapshot-dependent
             browsing link
         link_text (str): optional text for the generated link
             (the revision id will be used by default)
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         str: An HTML link in the form '<a href="revision_url">revision_id</a>'
     """
     if not revision_id:
         return None

     revision_url = gen_revision_url(revision_id, snapshot_context)

     if shorten_id:
         return gen_link(revision_url, revision_id[:7], link_attrs)
     else:
         if not link_text:
             link_text = revision_id
         return gen_link(revision_url, link_text, link_attrs)
 def gen_directory_link(
     sha1_git,
     snapshot_context=None,
     link_text="Browse",
     link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
 ):
     """
     Utility function for generating a link to a directory HTML view
     to insert in Django templates.

     Args:
         sha1_git (str): directory identifier
         link_text (str): optional text for the generated link
             (the directory id will be used by default)
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         An HTML link in the form '<a href="directory_view_url">link_text</a>'
     """
     if not sha1_git:
         return None

     query_params = _snapshot_context_query_params(snapshot_context)

     directory_url = reverse(
         "browse-directory", url_args={"sha1_git": sha1_git}, query_params=query_params
     )

     if not link_text:
         link_text = sha1_git
     return gen_link(directory_url, link_text, link_attrs)


 def gen_snapshot_link(
     snapshot_id,
     snapshot_context=None,
     link_text="Browse",
     link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
 ):
     """
     Utility function for generating a link to a snapshot HTML view
     to insert in Django templates.

     Args:
         snapshot_id (str): snapshot identifier
         link_text (str): optional text for the generated link
             (the snapshot id will be used by default)
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         An HTML link in the form '<a href="snapshot_view_url">link_text</a>'
     """
     query_params = _snapshot_context_query_params(snapshot_context)

     snapshot_url = reverse(
         "browse-snapshot",
         url_args={"snapshot_id": snapshot_id},
         query_params=query_params,
     )
     if not link_text:
         link_text = snapshot_id
     return gen_link(snapshot_url, link_text, link_attrs)


 def gen_content_link(
     sha1_git,
     snapshot_context=None,
     link_text="Browse",
     link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
 ):
     """
     Utility function for generating a link to a content HTML view
     to insert in Django templates.

     Args:
         sha1_git (str): content identifier
         link_text (str): optional text for the generated link
             (the content sha1_git will be used by default)
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         An HTML link in the form '<a href="content_view_url">link_text</a>'
     """
     if not sha1_git:
         return None

     query_params = _snapshot_context_query_params(snapshot_context)

     content_url = reverse(
         "browse-content",
         url_args={"query_string": "sha1_git:" + sha1_git},
         query_params=query_params,
     )
     if not link_text:
         link_text = sha1_git
     return gen_link(content_url, link_text, link_attrs)


 def get_revision_log_url(revision_id, snapshot_context=None):
     """
     Utility function for getting the URL for a revision log HTML view
     (possibly in the context of an origin).

     Args:
         revision_id (str): revision identifier the history heads to
         snapshot_context (dict): if provided, generate snapshot-dependent
             browsing link

     Returns:
         The revision log view URL
     """
     query_params = {}
     if snapshot_context:
         query_params = _snapshot_context_query_params(snapshot_context)

     query_params["revision"] = revision_id
     if snapshot_context and snapshot_context["origin_info"]:
         revision_log_url = reverse("browse-origin-log", query_params=query_params)
     elif snapshot_context:
         url_args = {"snapshot_id": snapshot_context["snapshot_id"]}
         del query_params["snapshot"]
         revision_log_url = reverse(
             "browse-snapshot-log", url_args=url_args, query_params=query_params
         )
     else:
         revision_log_url = reverse(
             "browse-revision-log", url_args={"sha1_git": revision_id}
         )
     return revision_log_url
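# Editor's note: a condensed restatement of the route selection performed by
# get_revision_log_url above, for readability; not new behavior.
def pick_log_route(snapshot_context) -> str:
    if snapshot_context and snapshot_context["origin_info"]:
        return "browse-origin-log"
    if snapshot_context:
        return "browse-snapshot-log"
    return "browse-revision-log"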
 def gen_revision_log_link(
     revision_id,
     snapshot_context=None,
     link_text="Browse",
     link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
 ):
     """
     Utility function for generating a link to a revision log HTML view
     (possibly in the context of an origin) to insert in Django templates.

     Args:
         revision_id (str): revision identifier the history heads to
         snapshot_context (dict): if provided, generate snapshot-dependent
             browsing link
         link_text (str): optional text to use for the generated link
             (the revision id will be used by default)
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         An HTML link in the form
         '<a href="revision_log_view_url">link_text</a>'
     """
     if not revision_id:
         return None

     revision_log_url = get_revision_log_url(revision_id, snapshot_context)

     if not link_text:
         link_text = revision_id
     return gen_link(revision_log_url, link_text, link_attrs)


 def gen_person_mail_link(person, link_text=None):
     """
     Utility function for generating a mail link to a person to insert
     in Django templates.

     Args:
         person (dict): dictionary containing person data
             (*name*, *email*, *fullname*)
         link_text (str): optional text to use for the generated mail link
             (the person name will be used by default)

     Returns:
         str: A mail link to the person or the person name if no email is
             present in person data
     """
     person_name = person["name"] or person["fullname"] or "None"
     if link_text is None:
         link_text = person_name
     person_email = person["email"] if person["email"] else None
     if person_email is None and "@" in person_name and " " not in person_name:
         person_email = person_name
     if person_email:
         return gen_link(url="mailto:%s" % person_email, link_text=link_text)
     else:
         return person_name


 def gen_release_link(
     sha1_git,
     snapshot_context=None,
     link_text="Browse",
     link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
 ):
     """
     Utility function for generating a link to a release HTML view
     to insert in Django templates.

     Args:
         sha1_git (str): release identifier
         link_text (str): optional text for the generated link
             (the release id will be used by default)
         link_attrs (dict): optional attributes (e.g. class)
             to add to the link

     Returns:
         An HTML link in the form '<a href="release_view_url">link_text</a>'
     """
     query_params = _snapshot_context_query_params(snapshot_context)

     release_url = reverse(
         "browse-release", url_args={"sha1_git": sha1_git}, query_params=query_params
     )
     if not link_text:
         link_text = sha1_git
     return gen_link(release_url, link_text, link_attrs)
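# Editor's note: standalone sketch of the email inference in
# gen_person_mail_link above: a person name that looks like a bare address
# ("@" present, no spaces) is used as the mailto target.
def infer_email(person: dict):
    name = person.get("name") or person.get("fullname") or "None"
    email = person.get("email") or None
    if email is None and "@" in name and " " not in name:
        email = name
    return email

# infer_email({"name": "jdoe@example.org", "email": None, "fullname": ""})
# -> "jdoe@example.org"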
 def format_log_entries(revision_log, per_page, snapshot_context=None):
     """
     Utility function that processes raw revision log data for HTML display.

     Its purpose is to:

         * add links to relevant browse views
         * format date in human readable format
         * truncate the message log

     Args:
         revision_log (list): raw revision log as returned by the swh-web api
         per_page (int): number of log entries per page
         snapshot_context (dict): if provided, generate snapshot-dependent
             browsing link
     """
     revision_log_data = []
     for i, rev in enumerate(revision_log):
         if i == per_page:
             break
         author_name = "None"
         author_fullname = "None"
         committer_fullname = "None"
         if rev["author"]:
             author_name = gen_person_mail_link(rev["author"])
             author_fullname = rev["author"]["fullname"]
         if rev["committer"]:
             committer_fullname = rev["committer"]["fullname"]
         author_date = format_utc_iso_date(rev["date"])
         committer_date = format_utc_iso_date(rev["committer_date"])

         tooltip = "revision %s\n" % rev["id"]
         tooltip += "author: %s\n" % author_fullname
         tooltip += "author date: %s\n" % author_date
         tooltip += "committer: %s\n" % committer_fullname
         tooltip += "committer date: %s\n\n" % committer_date
         if rev["message"]:
             tooltip += textwrap.indent(rev["message"], " " * 4)

         revision_log_data.append(
             {
                 "author": author_name,
                 "id": rev["id"][:7],
                 "message": rev["message"],
                 "date": author_date,
                 "commit_date": committer_date,
                 "url": gen_revision_url(rev["id"], snapshot_context),
                 "tooltip": tooltip,
             }
         )
     return revision_log_data


 # list of common readme names ordered by preference
 # (lower indices have higher priority)
 _common_readme_names = [
     "readme.markdown",
     "readme.md",
     "readme.rst",
     "readme.txt",
     "readme",
 ]


 def get_readme_to_display(readmes):
     """
     Process a list of readme files found in a directory
     in order to find the adequate one to display.

     Args:
         readmes: a list of dict where keys are readme file names and values
             are readme sha1s

     Returns:
         A tuple (readme_name, readme_sha1)
     """
     readme_name = None
     readme_url = None
     readme_sha1 = None
     readme_html = None

     lc_readmes = {k.lower(): {"orig_name": k, "sha1": v} for k, v in readmes.items()}

     # look for readme names according to the preference order
     # defined by the _common_readme_names list
     for common_readme_name in _common_readme_names:
         if common_readme_name in lc_readmes:
             readme_name = lc_readmes[common_readme_name]["orig_name"]
             readme_sha1 = lc_readmes[common_readme_name]["sha1"]
             readme_url = reverse(
                 "browse-content-raw",
                 url_args={"query_string": readme_sha1},
                 query_params={"re_encode": "true"},
             )
             break

     # otherwise pick the first readme like file if any
     if not readme_name and len(readmes.items()) > 0:
         readme_name = next(iter(readmes))
         readme_sha1 = readmes[readme_name]
         readme_url = reverse(
             "browse-content-raw",
             url_args={"query_string": readme_sha1},
             query_params={"re_encode": "true"},
         )

     # convert rst README to html server side as there is
     # no viable solution to perform that task client side
     if readme_name and readme_name.endswith(".rst"):

         @django_cache(
             catch_exception=True,
             exception_return_value="Readme bytes are not available",
         )
         def _rst_readme_to_html(readme_sha1):
             rst_doc = request_content(readme_sha1)
             return rst_to_html(rst_doc["raw_data"])

         readme_html = _rst_readme_to_html(readme_sha1)

     return readme_name, readme_url, readme_html
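# Editor's note: a runnable sketch of the case-insensitive preference lookup
# implemented by get_readme_to_display above.
PREFERRED_README_NAMES = ["readme.markdown", "readme.md", "readme.rst", "readme.txt", "readme"]

def pick_readme(readmes: dict):
    lc_names = {name.lower(): name for name in readmes}
    for candidate in PREFERRED_README_NAMES:
        if candidate in lc_names:
            return lc_names[candidate]
    # fall back to the first readme-like file, if any
    return next(iter(readmes), None)

# pick_readme({"README": "<sha1>", "README.md": "<sha1>"}) -> "README.md"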
diff --git a/swh/web/browse/views/content.py b/swh/web/browse/views/content.py
index 84d85225..b69241e8 100644
--- a/swh/web/browse/views/content.py
+++ b/swh/web/browse/views/content.py
@@ -1,447 +1,452 @@
 # Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import difflib
 from distutils.util import strtobool

 import sentry_sdk
 from django.http import HttpResponse, JsonResponse
 from django.shortcuts import redirect, render

 from swh.model.hashutil import hash_to_hex
 from swh.model.swhids import ObjectType
 from swh.web.browse.browseurls import browse_route
 from swh.web.browse.snapshot_context import get_snapshot_context
 from swh.web.browse.utils import (
     content_display_max_size,
     gen_link,
     prepare_content_for_display,
     request_content,
 )
 from swh.web.common import archive, highlightjs, query
 from swh.web.common.exc import BadInputExc, NotFoundExc, http_status_code_message
 from swh.web.common.identifiers import get_swhids_info
 from swh.web.common.typing import ContentMetadata, SWHObjectInfo
 from swh.web.common.utils import gen_path_info, reverse, swh_object_icons


 @browse_route(
     r"content/(?P<query_string>[0-9a-z_:]*[0-9a-f]+.)/raw/",
     view_name="browse-content-raw",
     checksum_args=["query_string"],
 )
 def content_raw(request, query_string):
     """Django view that produces a raw display of a content identified by its
     hash value.

     The url that points to it is
     :http:get:`/browse/content/[(algo_hash):](hash)/raw/`
     """
     re_encode = bool(strtobool(request.GET.get("re_encode", "false")))
     algo, checksum = query.parse_hash(query_string)
     checksum = hash_to_hex(checksum)
     content_data = request_content(query_string, max_size=None, re_encode=re_encode)

     filename = request.GET.get("filename", None)
     if not filename:
         filename = "%s_%s" % (algo, checksum)

     if (
         content_data["mimetype"].startswith("text/")
         or content_data["mimetype"] == "inode/x-empty"
     ):
         response = HttpResponse(content_data["raw_data"], content_type="text/plain")
         response["Content-disposition"] = "filename=%s" % filename
     else:
         response = HttpResponse(
             content_data["raw_data"], content_type="application/octet-stream"
         )
         response["Content-disposition"] = "attachment; filename=%s" % filename
     return response
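# Editor's note: minimal difflib usage matching the diff endpoint below; the
# [2:] slice drops the "---"/"+++" header lines, exactly as the view does.
import difflib

def unified_diff_str(from_text: str, to_text: str) -> str:
    diff_lines = difflib.unified_diff(
        from_text.splitlines(keepends=True), to_text.splitlines(keepends=True)
    )
    return "".join(list(diff_lines)[2:])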
""" diff_data = {} content_from = None content_to = None content_from_size = 0 content_to_size = 0 content_from_lines = [] content_to_lines = [] force = request.GET.get("force", "false") path = request.GET.get("path", None) language = "plaintext" force = bool(strtobool(force)) if from_query_string == to_query_string: diff_str = "File renamed without changes" else: try: text_diff = True if from_query_string: content_from = request_content(from_query_string, max_size=None) content_from_display_data = prepare_content_for_display( content_from["raw_data"], content_from["mimetype"], path ) language = content_from_display_data["language"] content_from_size = content_from["length"] if not ( content_from["mimetype"].startswith("text/") or content_from["mimetype"] == "inode/x-empty" ): text_diff = False if text_diff and to_query_string: content_to = request_content(to_query_string, max_size=None) content_to_display_data = prepare_content_for_display( content_to["raw_data"], content_to["mimetype"], path ) language = content_to_display_data["language"] content_to_size = content_to["length"] if not ( content_to["mimetype"].startswith("text/") or content_to["mimetype"] == "inode/x-empty" ): text_diff = False diff_size = abs(content_to_size - content_from_size) if not text_diff: diff_str = "Diffs are not generated for non textual content" language = "plaintext" elif not force and diff_size > _auto_diff_size_limit: diff_str = "Large diffs are not automatically computed" language = "plaintext" else: if content_from: content_from_lines = ( content_from["raw_data"].decode("utf-8").splitlines(True) ) if content_from_lines and content_from_lines[-1][-1] != "\n": content_from_lines[-1] += "[swh-no-nl-marker]\n" if content_to: content_to_lines = ( content_to["raw_data"].decode("utf-8").splitlines(True) ) if content_to_lines and content_to_lines[-1][-1] != "\n": content_to_lines[-1] += "[swh-no-nl-marker]\n" diff_lines = difflib.unified_diff(content_from_lines, content_to_lines) diff_str = "".join(list(diff_lines)[2:]) except Exception as exc: sentry_sdk.capture_exception(exc) diff_str = str(exc) diff_data["diff_str"] = diff_str diff_data["language"] = language return JsonResponse(diff_data) def _get_content_from_request(request): path = request.GET.get("path") if path is None: raise BadInputExc("The path query parameter must be provided.") snapshot = request.GET.get("snapshot") or request.GET.get("snapshot_id") origin_url = request.GET.get("origin_url") if snapshot is None and origin_url is None: raise BadInputExc( "The origin_url or snapshot query parameters must be provided." ) visit_id = int(request.GET.get("visit_id", 0)) snapshot_context = get_snapshot_context( snapshot_id=snapshot, origin_url=origin_url, path=path, timestamp=request.GET.get("timestamp"), visit_id=visit_id or None, branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), browse_context="content", ) root_directory = snapshot_context["root_directory"] return archive.lookup_directory_with_path(root_directory, path) @browse_route( r"content/(?P<query_string>[0-9a-z_:]*[0-9a-f]+.)/", r"content/", view_name="browse-content", checksum_args=["query_string"], ) def content_display(request, query_string=None): """Django view that produces an HTML display of a content identified by its hash value. 
 @browse_route(
     r"content/(?P<query_string>[0-9a-z_:]*[0-9a-f]+.)/",
     r"content/",
     view_name="browse-content",
     checksum_args=["query_string"],
 )
 def content_display(request, query_string=None):
     """Django view that produces an HTML display of a content identified by
     its hash value.

     The URLs that point to it are
     :http:get:`/browse/content/[(algo_hash):](hash)/`
     :http:get:`/browse/content/`
     """
     if query_string is None:
         # this case happens when redirected from origin/content or snapshot/content
         content = _get_content_from_request(request)
         return redirect(
             reverse(
                 "browse-content",
                 url_args={"query_string": f"sha1_git:{content['target']}"},
                 query_params=request.GET,
             ),
         )

     algo, checksum = query.parse_hash(query_string)
     checksum = hash_to_hex(checksum)
     origin_url = request.GET.get("origin_url")
     selected_language = request.GET.get("language")
     if not origin_url:
         origin_url = request.GET.get("origin")
     snapshot_id = request.GET.get("snapshot") or request.GET.get("snapshot_id")
     path = request.GET.get("path")
     content_data = {}
     error_info = {"status_code": 200, "description": None}
     try:
         content_data = request_content(query_string)
     except NotFoundExc as e:
         error_info["status_code"] = 404
         error_info["description"] = f"NotFoundExc: {str(e)}"

     snapshot_context = None
     if origin_url is not None or snapshot_id is not None:
         try:
             visit_id = int(request.GET.get("visit_id", 0))
             snapshot_context = get_snapshot_context(
                 origin_url=origin_url,
                 snapshot_id=snapshot_id,
                 timestamp=request.GET.get("timestamp"),
                 visit_id=visit_id or None,
                 branch_name=request.GET.get("branch"),
                 release_name=request.GET.get("release"),
                 revision_id=request.GET.get("revision"),
                 path=path,
                 browse_context="content",
             )
         except NotFoundExc as e:
             if str(e).startswith("Origin"):
                 raw_cnt_url = reverse(
                     "browse-content", url_args={"query_string": query_string}
                 )
                 error_message = (
                     "The Software Heritage archive has a content "
                     "with the hash you provided but the origin "
                     "mentioned in your request appears broken: %s. "
                     "Please check the URL and try again.\n\n"
                     "Nevertheless, you can still browse the content "
                     "without origin information: %s"
                     % (gen_link(origin_url), gen_link(raw_cnt_url))
                 )
                 raise NotFoundExc(error_message)
             else:
                 raise e

     content = None
     language = None
     mimetype = None
     if content_data.get("raw_data") is not None:
         content_display_data = prepare_content_for_display(
             content_data["raw_data"], content_data["mimetype"], path
         )
         content = content_display_data["content_data"]
         language = content_display_data["language"]
         mimetype = content_display_data["mimetype"]

     # Override language with user-selected language
     if selected_language is not None:
         language = selected_language

     available_languages = None
     if mimetype and "text/" in mimetype:
         available_languages = highlightjs.get_supported_languages()

     filename = None
     path_info = None
     directory_id = None

     root_dir = None
     if snapshot_context:
         root_dir = snapshot_context.get("root_directory")

     query_params = snapshot_context["query_params"] if snapshot_context else {}

     breadcrumbs = []

     if path:
         split_path = path.split("/")
         root_dir = root_dir or split_path[0]
         filename = split_path[-1]
         if root_dir != path:
             path = path.replace(root_dir + "/", "")
             path = path[: -len(filename)]
             path_info = gen_path_info(path)
             query_params.pop("path", None)
             dir_url = reverse(
                 "browse-directory",
                 url_args={"sha1_git": root_dir},
                 query_params=query_params,
             )
             breadcrumbs.append({"name": root_dir[:7], "url": dir_url})
             for pi in path_info:
                 query_params["path"] = pi["path"]
                 dir_url = reverse(
                     "browse-directory",
                     url_args={"sha1_git": root_dir},
                     query_params=query_params,
                 )
                 breadcrumbs.append({"name": pi["name"], "url": dir_url})
         breadcrumbs.append({"name": filename, "url": None})
     if path and root_dir != path:
         dir_info = archive.lookup_directory_with_path(root_dir, path)
         directory_id = dir_info["target"]
     elif root_dir != path:
         directory_id = root_dir
     else:
         root_dir = None

     query_params = {"filename": filename}

     content_checksums = content_data.get("checksums", {})

-    content_url = reverse("browse-content", url_args={"query_string": query_string},)
+    content_url = reverse(
+        "browse-content",
+        url_args={"query_string": query_string},
+    )

     content_raw_url = reverse(
         "browse-content-raw",
         url_args={"query_string": query_string},
         query_params=query_params,
     )

     content_metadata = ContentMetadata(
         object_type=ObjectType.CONTENT,
         object_id=content_checksums.get("sha1_git"),
         sha1=content_checksums.get("sha1"),
         sha1_git=content_checksums.get("sha1_git"),
         sha256=content_checksums.get("sha256"),
         blake2s256=content_checksums.get("blake2s256"),
         content_url=content_url,
         mimetype=content_data.get("mimetype"),
         encoding=content_data.get("encoding"),
         size=content_data.get("length", 0),
         language=content_data.get("language"),
         root_directory=root_dir,
         path=f"/{path}" if path else None,
         filename=filename or "",
         directory=directory_id,
         revision=None,
         release=None,
         snapshot=None,
         origin_url=origin_url,
     )

     swh_objects = []
     if content_checksums:
         swh_objects.append(
             SWHObjectInfo(
                 object_type=ObjectType.CONTENT,
                 object_id=content_checksums.get("sha1_git"),
             )
         )
     if directory_id:
         swh_objects.append(
             SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=directory_id)
         )
     if snapshot_context:
         if snapshot_context["revision_id"]:
             swh_objects.append(
                 SWHObjectInfo(
                     object_type=ObjectType.REVISION,
                     object_id=snapshot_context["revision_id"],
                 )
             )
         swh_objects.append(
             SWHObjectInfo(
                 object_type=ObjectType.SNAPSHOT,
                 object_id=snapshot_context["snapshot_id"],
             )
         )
         if snapshot_context["release_id"]:
             swh_objects.append(
                 SWHObjectInfo(
                     object_type=ObjectType.RELEASE,
                     object_id=snapshot_context["release_id"],
                 )
             )

     swhids_info = get_swhids_info(
-        swh_objects, snapshot_context, extra_context=content_metadata,
+        swh_objects,
+        snapshot_context,
+        extra_context=content_metadata,
     )

     heading = "Content - %s" % content_checksums.get("sha1_git")
     if breadcrumbs:
         content_path = "/".join([bc["name"] for bc in breadcrumbs])
         heading += " - %s" % content_path

     return render(
         request,
         "browse/content.html",
         {
             "heading": heading,
             "swh_object_id": swhids_info[0]["swhid"] if swhids_info else "",
             "swh_object_name": "Content",
             "swh_object_metadata": content_metadata,
             "content": content,
             "content_size": content_data.get("length"),
             "max_content_size": content_display_max_size,
             "filename": filename,
             "encoding": content_data.get("encoding"),
             "mimetype": mimetype,
             "language": language,
             "available_languages": available_languages,
             "breadcrumbs": breadcrumbs,
             "top_right_link": {
                 "url": content_raw_url,
                 "icon": swh_object_icons["content"],
                 "text": "Raw File",
             },
             "snapshot_context": snapshot_context,
             "vault_cooking": None,
             "show_actions": True,
             "swhids_info": swhids_info,
             "error_code": error_info["status_code"],
             "error_message": http_status_code_message.get(error_info["status_code"]),
             "error_description": error_info["description"],
         },
         status=error_info["status_code"],
     )
diff --git a/swh/web/browse/views/directory.py b/swh/web/browse/views/directory.py
index 3447f67a..cb598ea6 100644
--- a/swh/web/browse/views/directory.py
+++ b/swh/web/browse/views/directory.py
@@ -1,286 +1,292 @@
 # Copyright (C) 2017-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os

 import sentry_sdk
 from django.http import HttpResponse
 from django.shortcuts import redirect, render

 from swh.model.swhids import ObjectType
 from swh.web.browse.browseurls import browse_route
 from swh.web.browse.snapshot_context import get_snapshot_context
 from swh.web.browse.utils import gen_link, get_directory_entries, get_readme_to_display
 from swh.web.common import archive
 from swh.web.common.exc import NotFoundExc, http_status_code_message
 from swh.web.common.identifiers import get_swhids_info
 from swh.web.common.typing import DirectoryMetadata, SWHObjectInfo
 from swh.web.common.utils import gen_path_info, reverse, swh_object_icons


 def _directory_browse(request, sha1_git, path=None):
     root_sha1_git = sha1_git
     error_info = {"status_code": 200, "description": None}
     if path:
         try:
             dir_info = archive.lookup_directory_with_path(sha1_git, path)
             sha1_git = dir_info["target"]
         except NotFoundExc as e:
             error_info["status_code"] = 404
             error_info["description"] = f"NotFoundExc: {str(e)}"
             sha1_git = None

     dirs, files = [], []
     if sha1_git is not None:
         dirs, files = get_directory_entries(sha1_git)

     origin_url = request.GET.get("origin_url")
     if not origin_url:
         origin_url = request.GET.get("origin")
     snapshot_id = request.GET.get("snapshot")
     snapshot_context = None
     if origin_url is not None or snapshot_id is not None:
         try:
             snapshot_context = get_snapshot_context(
                 snapshot_id=snapshot_id,
                 origin_url=origin_url,
                 branch_name=request.GET.get("branch"),
                 release_name=request.GET.get("release"),
                 revision_id=request.GET.get("revision"),
                 path=path,
             )
         except NotFoundExc as e:
             if str(e).startswith("Origin"):
                 raw_dir_url = reverse(
                     "browse-directory", url_args={"sha1_git": sha1_git}
                 )
" "Please check the URL and try again.\n\n" "Nevertheless, you can still browse the directory " "without origin information: %s" % (gen_link(origin_url), gen_link(raw_dir_url)) ) raise NotFoundExc(error_message) else: raise e path_info = gen_path_info(path) query_params = snapshot_context["query_params"] if snapshot_context else {} breadcrumbs = [] breadcrumbs.append( { "name": root_sha1_git[:7], "url": reverse( "browse-directory", url_args={"sha1_git": root_sha1_git}, query_params={**query_params, "path": None}, ), } ) for pi in path_info: breadcrumbs.append( { "name": pi["name"], "url": reverse( "browse-directory", url_args={"sha1_git": root_sha1_git}, - query_params={**query_params, "path": pi["path"],}, + query_params={ + **query_params, + "path": pi["path"], + }, ), } ) path = "" if path is None else (path + "/") for d in dirs: if d["type"] == "rev": d["url"] = reverse( "browse-revision", url_args={"sha1_git": d["target"]}, query_params=query_params, ) else: d["url"] = reverse( "browse-directory", url_args={"sha1_git": root_sha1_git}, - query_params={**query_params, "path": path + d["name"],}, + query_params={ + **query_params, + "path": path + d["name"], + }, ) sum_file_sizes = 0 readmes = {} for f in files: query_string = "sha1_git:" + f["target"] f["url"] = reverse( "browse-content", url_args={"query_string": query_string}, query_params={ **query_params, "path": root_sha1_git + "/" + path + f["name"], }, ) if f["length"] is not None: sum_file_sizes += f["length"] if f["name"].lower().startswith("readme"): readmes[f["name"]] = f["checksums"]["sha1"] readme_name, readme_url, readme_html = get_readme_to_display(readmes) dir_metadata = DirectoryMetadata( object_type=ObjectType.DIRECTORY, object_id=sha1_git, directory=root_sha1_git, nb_files=len(files), nb_dirs=len(dirs), sum_file_sizes=sum_file_sizes, root_directory=root_sha1_git, path=f"/{path}" if path else None, revision=None, revision_found=None, release=None, snapshot=None, ) vault_cooking = { "directory_context": True, "directory_swhid": f"swh:1:dir:{sha1_git}", "revision_context": False, "revision_swhid": None, } swh_objects = [SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=sha1_git)] if snapshot_context: if snapshot_context["revision_id"]: swh_objects.append( SWHObjectInfo( object_type=ObjectType.REVISION, object_id=snapshot_context["revision_id"], ) ) swh_objects.append( SWHObjectInfo( object_type=ObjectType.SNAPSHOT, object_id=snapshot_context["snapshot_id"], ) ) if snapshot_context["release_id"]: swh_objects.append( SWHObjectInfo( object_type=ObjectType.RELEASE, object_id=snapshot_context["release_id"], ) ) swhids_info = get_swhids_info(swh_objects, snapshot_context, dir_metadata) heading = "Directory - %s" % sha1_git if breadcrumbs: dir_path = "/".join([bc["name"] for bc in breadcrumbs]) + "/" heading += " - %s" % dir_path top_right_link = None if ( snapshot_context is not None and not snapshot_context["is_empty"] and snapshot_context["revision_id"] is not None ): history_url = reverse( "browse-revision-log", url_args={"sha1_git": snapshot_context["revision_id"]}, query_params=query_params, ) top_right_link = { "url": history_url, "icon": swh_object_icons["revisions history"], "text": "History", } return render( request, "browse/directory.html", { "heading": heading, "swh_object_id": swhids_info[0]["swhid"], "swh_object_name": "Directory", "swh_object_metadata": dir_metadata, "dirs": dirs, "files": files, "breadcrumbs": breadcrumbs, "top_right_link": top_right_link, "readme_name": readme_name, "readme_url": 
readme_url, "readme_html": readme_html, "snapshot_context": snapshot_context, "vault_cooking": vault_cooking, "show_actions": True, "swhids_info": swhids_info, "error_code": error_info["status_code"], "error_message": http_status_code_message.get(error_info["status_code"]), "error_description": error_info["description"], }, status=error_info["status_code"], ) @browse_route( r"directory/(?P<sha1_git>[0-9a-f]+)/", view_name="browse-directory", checksum_args=["sha1_git"], ) def directory_browse(request, sha1_git): """Django view for browsing the content of a directory identified by its sha1_git value. The url that points to it is :http:get:`/browse/directory/(sha1_git)/` """ return _directory_browse(request, sha1_git, request.GET.get("path")) @browse_route( r"directory/(?P<sha1_git>[0-9a-f]+)/(?P<path>.+)/", view_name="browse-directory-legacy", checksum_args=["sha1_git"], ) def directory_browse_legacy(request, sha1_git, path): """Django view for browsing the content of a directory identified by its sha1_git value. The url that points to it is :http:get:`/browse/directory/(sha1_git)/(path)/` """ return _directory_browse(request, sha1_git, path) @browse_route( r"directory/resolve/content-path/(?P<sha1_git>[0-9a-f]+)/", view_name="browse-directory-resolve-content-path", checksum_args=["sha1_git"], ) def _directory_resolve_content_path(request, sha1_git): """ Internal endpoint redirecting to data url for a specific file path relative to a root directory. """ try: path = os.path.normpath(request.GET.get("path")) if not path.startswith("../"): dir_info = archive.lookup_directory_with_path(sha1_git, path) if dir_info["type"] == "file": sha1 = dir_info["checksums"]["sha1"] data_url = reverse( "browse-content-raw", url_args={"query_string": sha1} ) return redirect(data_url) except Exception as exc: sentry_sdk.capture_exception(exc) return HttpResponse(status=404) diff --git a/swh/web/browse/views/origin.py b/swh/web/browse/views/origin.py index 45e659a0..4995b27e 100644 --- a/swh/web/browse/views/origin.py +++ b/swh/web/browse/views/origin.py @@ -1,296 +1,310 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import redirect, render from swh.web.browse.browseurls import browse_route from swh.web.browse.snapshot_context import ( browse_snapshot_directory, get_snapshot_context, ) from swh.web.common import archive from swh.web.common.exc import BadInputExc from swh.web.common.origin_visits import get_origin_visits from swh.web.common.utils import ( format_utc_iso_date, parse_iso8601_date_to_utc, redirect_to_new_route, reverse, ) @browse_route( - r"origin/directory/", view_name="browse-origin-directory", + r"origin/directory/", + view_name="browse-origin-directory", ) def origin_directory_browse(request): """Django view for browsing the content of a directory associated to an origin for a given visit. 
The URL that points to it is :http:get:`/browse/origin/directory/` """ return browse_snapshot_directory( request, origin_url=request.GET.get("origin_url"), snapshot_id=request.GET.get("snapshot"), timestamp=request.GET.get("timestamp"), path=request.GET.get("path"), ) @browse_route( r"origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/directory/", r"origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/directory/(?P<path>.+)/", r"origin/(?P<origin_url>.+)/directory/(?P<path>.+)/", r"origin/(?P<origin_url>.+)/directory/", view_name="browse-origin-directory-legacy", ) def origin_directory_browse_legacy(request, origin_url, timestamp=None, path=None): """Django view for browsing the content of a directory associated to an origin for a given visit. The URLs that point to it are :http:get:`/browse/origin/(origin_url)/directory/[(path)/]` and :http:get:`/browse/origin/(origin_url)/visit/(timestamp)/directory/[(path)/]` """ return browse_snapshot_directory( request, origin_url=origin_url, snapshot_id=request.GET.get("snapshot"), timestamp=timestamp, path=path, ) @browse_route( - r"origin/content/", view_name="browse-origin-content", + r"origin/content/", + view_name="browse-origin-content", ) def origin_content_browse(request): """ This route is deprecated; use http:get:`/browse/content` instead Django view that produces an HTML display of a content associated to an origin for a given visit. The URL that points to it is :http:get:`/browse/origin/content/` """ return redirect_to_new_route(request, "browse-content") @browse_route( r"origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/content/(?P<path>.+)/", r"origin/(?P<origin_url>.+)/content/(?P<path>.+)/", r"origin/(?P<origin_url>.+)/content/", view_name="browse-origin-content-legacy", ) def origin_content_browse_legacy(request, origin_url, path=None, timestamp=None): """ This route is deprecated; use http:get:`/browse/content` instead Django view that produces an HTML display of a content associated to an origin for a given visit. The URLs that point to it are :http:get:`/browse/origin/(origin_url)/content/(path)/` and :http:get:`/browse/origin/(origin_url)/visit/(timestamp)/content/(path)/` """ return redirect_to_new_route(request, "browse-content") @browse_route( - r"origin/log/", view_name="browse-origin-log", + r"origin/log/", + view_name="browse-origin-log", ) def origin_log_browse(request): """ This route is deprecated; use http:get:`/browse/snapshot/log` instead Django view that produces an HTML display of revisions history (aka the commit log) associated to a software origin. The URL that points to it is :http:get:`/browse/origin/log/` """ return redirect_to_new_route(request, "browse-snapshot-log") @browse_route( r"origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/log/", r"origin/(?P<origin_url>.+)/log/", view_name="browse-origin-log-legacy", ) def origin_log_browse_legacy(request, origin_url, timestamp=None): """ This route is deprecated; use http:get:`/browse/snapshot/log` instead Django view that produces an HTML display of revisions history (aka the commit log) associated to a software origin. 
The URLs that point to it are :http:get:`/browse/origin/(origin_url)/log/` and :http:get:`/browse/origin/(origin_url)/visit/(timestamp)/log/` """ - return redirect_to_new_route(request, "browse-snapshot-log",) + return redirect_to_new_route( + request, + "browse-snapshot-log", + ) @browse_route( - r"origin/branches/", view_name="browse-origin-branches", + r"origin/branches/", + view_name="browse-origin-branches", ) def origin_branches_browse(request): """ This route is deprecated; use http:get:`/browse/snapshot/branches` instead Django view that produces an HTML display of the list of branches associated to an origin for a given visit. The URL that points to it is :http:get:`/browse/origin/branches/` """ return redirect_to_new_route(request, "browse-snapshot-branches") @browse_route( r"origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/branches/", r"origin/(?P<origin_url>.+)/branches/", view_name="browse-origin-branches-legacy", ) def origin_branches_browse_legacy(request, origin_url, timestamp=None): """ This route is deprecated; use http:get:`/browse/snapshot/branches` instead Django view that produces an HTML display of the list of branches associated to an origin for a given visit. The URLs that point to it are :http:get:`/browse/origin/(origin_url)/branches/` and :http:get:`/browse/origin/(origin_url)/visit/(timestamp)/branches/` """ return redirect_to_new_route(request, "browse-snapshot-branches") @browse_route( - r"origin/releases/", view_name="browse-origin-releases", + r"origin/releases/", + view_name="browse-origin-releases", ) def origin_releases_browse(request): """ This route is deprecated; use http:get:`/browse/snapshot/releases` instead Django view that produces an HTML display of the list of releases associated to an origin for a given visit. The URL that points to it is :http:get:`/browse/origin/releases/` """ return redirect_to_new_route(request, "browse-snapshot-releases") @browse_route( r"origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/releases/", r"origin/(?P<origin_url>.+)/releases/", view_name="browse-origin-releases-legacy", ) def origin_releases_browse_legacy(request, origin_url, timestamp=None): """ This route is deprecated; use http:get:`/browse/snapshot/releases` instead Django view that produces an HTML display of the list of releases associated to an origin for a given visit. 
The URLs that point to it are :http:get:`/browse/origin/(origin_url)/releases/` and :http:get:`/browse/origin/(origin_url)/visit/(timestamp)/releases/` """ return redirect_to_new_route(request, "browse-snapshot-releases") def _origin_visits_browse(request, origin_url): if origin_url is None: raise BadInputExc("An origin URL must be provided as query parameter.") origin_info = archive.lookup_origin({"url": origin_url}) origin_visits = get_origin_visits(origin_info) snapshot_context = get_snapshot_context(origin_url=origin_url) for i, visit in enumerate(origin_visits): url_date = format_utc_iso_date(visit["date"], "%Y-%m-%dT%H:%M:%SZ") visit["formatted_date"] = format_utc_iso_date(visit["date"]) query_params = {"origin_url": origin_url, "timestamp": url_date} if i < len(origin_visits) - 1: if visit["date"] == origin_visits[i + 1]["date"]: query_params = {"visit_id": visit["visit"]} if i > 0: if visit["date"] == origin_visits[i - 1]["date"]: query_params = {"visit_id": visit["visit"]} snapshot = visit["snapshot"] if visit["snapshot"] else "" - visit["url"] = reverse("browse-origin-directory", query_params=query_params,) + visit["url"] = reverse( + "browse-origin-directory", + query_params=query_params, + ) if not snapshot: visit["snapshot"] = "" visit["date"] = parse_iso8601_date_to_utc(visit["date"]).timestamp() heading = "Origin visits - %s" % origin_url return render( request, "browse/origin-visits.html", { "heading": heading, "swh_object_name": "Visits", "swh_object_metadata": origin_info, "origin_visits": origin_visits, "origin_info": origin_info, "snapshot_context": snapshot_context, "vault_cooking": None, "show_actions": False, }, ) @browse_route(r"origin/visits/", view_name="browse-origin-visits") def origin_visits_browse(request): """Django view that produces an HTML display of visits reporting for a given origin. The URL that points to it is :http:get:`/browse/origin/visits/`. """ return _origin_visits_browse(request, request.GET.get("origin_url")) @browse_route( r"origin/(?P<origin_url>.+)/visits/", view_name="browse-origin-visits-legacy" ) def origin_visits_browse_legacy(request, origin_url): """Django view that produces an HTML display of visits reporting for a given origin. The URL that points to it is :http:get:`/browse/origin/(origin_url)/visits/`. """ return _origin_visits_browse(request, origin_url) @browse_route(r"origin/", view_name="browse-origin") def origin_browse(request): """Django view that redirects to the display of the latest archived snapshot for a given software origin. """ - last_snapshot_url = reverse("browse-origin-directory", query_params=request.GET,) + last_snapshot_url = reverse( + "browse-origin-directory", + query_params=request.GET, + ) return redirect(last_snapshot_url) @browse_route(r"origin/(?P<origin_url>.+)/", view_name="browse-origin-legacy") def origin_browse_legacy(request, origin_url): """Django view that redirects to the display of the latest archived snapshot for a given software origin. 
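Example (illustrative only; the origin URL is made up)::

    GET /browse/origin/https://example.org/user/repo/

redirects to::

    GET /browse/origin/directory/?origin_url=https://example.org/user/repo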
""" last_snapshot_url = reverse( "browse-origin-directory", query_params={"origin_url": origin_url, **request.GET}, ) return redirect(last_snapshot_url) diff --git a/swh/web/browse/views/revision.py b/swh/web/browse/views/revision.py index bbaab1ed..b3f020ed 100644 --- a/swh/web/browse/views/revision.py +++ b/swh/web/browse/views/revision.py @@ -1,587 +1,589 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import json import textwrap from django.http import JsonResponse from django.shortcuts import render from django.utils.safestring import mark_safe from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType from swh.web.browse.browseurls import browse_route from swh.web.browse.snapshot_context import get_snapshot_context from swh.web.browse.utils import ( content_display_max_size, format_log_entries, gen_link, gen_person_mail_link, gen_revision_url, get_directory_entries, get_readme_to_display, get_revision_log_url, prepare_content_for_display, request_content, ) from swh.web.common import archive from swh.web.common.exc import NotFoundExc, http_status_code_message from swh.web.common.identifiers import get_swhids_info from swh.web.common.typing import RevisionMetadata, SWHObjectInfo from swh.web.common.utils import ( format_utc_iso_date, gen_path_info, reverse, swh_object_icons, ) def _gen_content_url(revision, query_string, path, snapshot_context): if snapshot_context: query_params = snapshot_context["query_params"] query_params["path"] = path query_params["revision"] = revision["id"] content_url = reverse("browse-origin-content", query_params=query_params) else: content_path = "%s/%s" % (revision["directory"], path) content_url = reverse( "browse-content", url_args={"query_string": query_string}, query_params={"path": content_path}, ) return content_url def _gen_diff_link(idx, diff_anchor, link_text): if idx < _max_displayed_file_diffs: return gen_link(diff_anchor, link_text) else: return link_text # TODO: put in conf _max_displayed_file_diffs = 1000 def _gen_revision_changes_list(revision, changes, snapshot_context): """ Returns a HTML string describing the file changes introduced in a revision. As this string will be displayed in the browse revision view, links to adequate file diffs are also generated. Args: revision (str): hexadecimal representation of a revision identifier changes (list): list of file changes in the revision snapshot_context (dict): optional origin context used to reverse the content urls Returns: A string to insert in a revision HTML view. 
""" changes_msg = [] for i, change in enumerate(changes): hasher = hashlib.sha1() from_query_string = "" to_query_string = "" diff_id = "diff-" if change["from"]: from_query_string = "sha1_git:" + change["from"]["target"] diff_id += change["from"]["target"] + "-" + change["from_path"] diff_id += "-" if change["to"]: to_query_string = "sha1_git:" + change["to"]["target"] diff_id += change["to"]["target"] + change["to_path"] change["path"] = change["to_path"] or change["from_path"] url_args = { "from_query_string": from_query_string, "to_query_string": to_query_string, } query_params = {"path": change["path"]} change["diff_url"] = reverse( "diff-contents", url_args=url_args, query_params=query_params ) hasher.update(diff_id.encode("utf-8")) diff_id = hasher.hexdigest() change["id"] = diff_id diff_link = "#diff_" + diff_id if change["type"] == "modify": change["content_url"] = _gen_content_url( revision, to_query_string, change["to_path"], snapshot_context ) changes_msg.append( "modified: %s" % _gen_diff_link(i, diff_link, change["to_path"]) ) elif change["type"] == "insert": change["content_url"] = _gen_content_url( revision, to_query_string, change["to_path"], snapshot_context ) changes_msg.append( "new file: %s" % _gen_diff_link(i, diff_link, change["to_path"]) ) elif change["type"] == "delete": parent = archive.lookup_revision(revision["parents"][0]) change["content_url"] = _gen_content_url( parent, from_query_string, change["from_path"], snapshot_context ) changes_msg.append( "deleted: %s" % _gen_diff_link(i, diff_link, change["from_path"]) ) elif change["type"] == "rename": change["content_url"] = _gen_content_url( revision, to_query_string, change["to_path"], snapshot_context ) link_text = change["from_path"] + " → " + change["to_path"] changes_msg.append( "renamed: %s" % _gen_diff_link(i, diff_link, link_text) ) if not changes: changes_msg.append("No changes") return mark_safe("\n".join(changes_msg)) @browse_route( r"revision/(?P<sha1_git>[0-9a-f]+)/diff/", view_name="diff-revision", checksum_args=["sha1_git"], ) def _revision_diff(request, sha1_git): """ Browse internal endpoint to compute revision diff """ revision = archive.lookup_revision(sha1_git) snapshot_context = None origin_url = request.GET.get("origin_url", None) if not origin_url: origin_url = request.GET.get("origin", None) timestamp = request.GET.get("timestamp", None) visit_id = request.GET.get("visit_id", None) if origin_url: snapshot_context = get_snapshot_context( origin_url=origin_url, timestamp=timestamp, visit_id=visit_id ) changes = archive.diff_revision(sha1_git) changes_msg = _gen_revision_changes_list(revision, changes, snapshot_context) diff_data = { "total_nb_changes": len(changes), "changes": changes[:_max_displayed_file_diffs], "changes_msg": changes_msg, } return JsonResponse(diff_data) NB_LOG_ENTRIES = 100 @browse_route( r"revision/(?P<sha1_git>[0-9a-f]+)/log/", view_name="browse-revision-log", checksum_args=["sha1_git"], ) def revision_log_browse(request, sha1_git): """ Django view that produces an HTML display of the history log for a revision identified by its id. 
The url that points to it is :http:get:`/browse/revision/(sha1_git)/log/` """ origin_url = request.GET.get("origin_url") snapshot_id = request.GET.get("snapshot") snapshot_context = None if origin_url or snapshot_id: visit_id = int(request.GET.get("visit_id", 0)) snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=request.GET.get("timestamp"), visit_id=visit_id or None, branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), revision_id=sha1_git, ) per_page = int(request.GET.get("per_page", NB_LOG_ENTRIES)) offset = int(request.GET.get("offset", 0)) revs_ordering = request.GET.get("revs_ordering", "committer_date") session_key = "rev_%s_log_ordering_%s" % (sha1_git, revs_ordering) rev_log_session = request.session.get(session_key, None) rev_log = [] revs_walker_state = None if rev_log_session: rev_log = rev_log_session["rev_log"] revs_walker_state = rev_log_session["revs_walker_state"] if len(rev_log) < offset + per_page: revs_walker = archive.get_revisions_walker( revs_ordering, sha1_git, max_revs=offset + per_page + 1, state=revs_walker_state, ) rev_log += [rev["id"] for rev in revs_walker] revs_walker_state = revs_walker.export_state() revs = rev_log[offset : offset + per_page] revision_log = archive.lookup_revision_multiple(revs) request.session[session_key] = { "rev_log": rev_log, "revs_walker_state": revs_walker_state, } revs_ordering = request.GET.get("revs_ordering", "") prev_log_url = None if len(rev_log) > offset + per_page: prev_log_url = reverse( "browse-revision-log", url_args={"sha1_git": sha1_git}, query_params={ "per_page": per_page, "offset": offset + per_page, "revs_ordering": revs_ordering or None, }, ) next_log_url = None if offset != 0: next_log_url = reverse( "browse-revision-log", url_args={"sha1_git": sha1_git}, query_params={ "per_page": per_page, "offset": offset - per_page, "revs_ordering": revs_ordering or None, }, ) revision_log_data = format_log_entries(revision_log, per_page) swh_rev_id = str( CoreSWHID(object_type=ObjectType.REVISION, object_id=hash_to_bytes(sha1_git)) ) return render( request, "browse/revision-log.html", { "heading": "Revision history", "swh_object_id": swh_rev_id, "swh_object_name": "Revisions history", "swh_object_metadata": None, "revision_log": revision_log_data, "revs_ordering": revs_ordering, "next_log_url": next_log_url, "prev_log_url": prev_log_url, "breadcrumbs": None, "top_right_link": None, "snapshot_context": snapshot_context, "vault_cooking": None, "show_actions": True, "swhids_info": None, }, ) @browse_route( r"revision/(?P<sha1_git>[0-9a-f]+)/", view_name="browse-revision", checksum_args=["sha1_git"], ) def revision_browse(request, sha1_git): """ Django view that produces an HTML display of a revision identified by its id. The url that points to it is :http:get:`/browse/revision/(sha1_git)/`. 
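Example (illustrative only; the revision identifier and origin URL are made up)::

    GET /browse/revision/1b6453892473a467d07372d45eb05abc2031647a/?origin_url=https://example.org/user/repo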
""" revision = archive.lookup_revision(sha1_git) origin_info = None snapshot_context = None origin_url = request.GET.get("origin_url") if not origin_url: origin_url = request.GET.get("origin") timestamp = request.GET.get("timestamp") visit_id = int(request.GET.get("visit_id", 0)) snapshot_id = request.GET.get("snapshot_id") if not snapshot_id: snapshot_id = request.GET.get("snapshot") path = request.GET.get("path") dir_id = None dirs, files = [], [] content_data = {} if origin_url: try: snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=timestamp, visit_id=visit_id or None, branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), revision_id=sha1_git, path=path, ) except NotFoundExc as e: raw_rev_url = reverse("browse-revision", url_args={"sha1_git": sha1_git}) error_message = ( "The Software Heritage archive has a revision " "with the hash you provided but the origin " "mentioned in your request appears broken: %s. " "Please check the URL and try again.\n\n" "Nevertheless, you can still browse the revision " "without origin information: %s" % (gen_link(origin_url), gen_link(raw_rev_url)) ) if str(e).startswith("Origin"): raise NotFoundExc(error_message) else: raise e origin_info = snapshot_context["origin_info"] snapshot_id = snapshot_context["snapshot_id"] elif snapshot_id: snapshot_context = get_snapshot_context(snapshot_id) error_info = {"status_code": 200, "description": None} if path: try: file_info = archive.lookup_directory_with_path(revision["directory"], path) if file_info["type"] == "dir": dir_id = file_info["target"] else: query_string = "sha1_git:" + file_info["target"] content_data = request_content(query_string) except NotFoundExc as e: error_info["status_code"] = 404 error_info["description"] = f"NotFoundExc: {str(e)}" else: dir_id = revision["directory"] if dir_id: path = "" if path is None else (path + "/") dirs, files = get_directory_entries(dir_id) revision_metadata = RevisionMetadata( object_type=ObjectType.REVISION, object_id=sha1_git, revision=sha1_git, author=revision["author"]["fullname"] if revision["author"] else "None", author_url=gen_person_mail_link(revision["author"]) if revision["author"] else "None", committer=revision["committer"]["fullname"] if revision["committer"] else "None", committer_url=gen_person_mail_link(revision["committer"]) if revision["committer"] else "None", committer_date=format_utc_iso_date(revision["committer_date"]), date=format_utc_iso_date(revision["date"]), directory=revision["directory"], merge=revision["merge"], metadata=json.dumps( revision["metadata"], sort_keys=True, indent=4, separators=(",", ": ") ), parents=revision["parents"], synthetic=revision["synthetic"], type=revision["type"], snapshot=snapshot_id, origin_url=origin_url, ) message_lines = ["None"] if revision["message"]: message_lines = revision["message"].split("\n") parents = [] for p in revision["parents"]: parent_url = gen_revision_url(p, snapshot_context) parents.append({"id": p, "url": parent_url}) path_info = gen_path_info(path) query_params = snapshot_context["query_params"] if snapshot_context else {} breadcrumbs = [] breadcrumbs.append( { "name": revision["directory"][:7], "url": reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ), } ) for pi in path_info: query_params["path"] = pi["path"] breadcrumbs.append( { "name": pi["name"], "url": reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ), } ) vault_cooking = { 
"directory_context": False, "directory_swhid": None, "revision_context": True, "revision_swhid": f"swh:1:rev:{sha1_git}", } swh_objects = [SWHObjectInfo(object_type=ObjectType.REVISION, object_id=sha1_git)] content = None content_size = None filename = None mimetype = None language = None readme_name = None readme_url = None readme_html = None readmes = {} extra_context = dict(revision_metadata) extra_context["path"] = f"/{path}" if path else None if content_data: breadcrumbs[-1]["url"] = None content_size = content_data["length"] mimetype = content_data["mimetype"] if content_data["raw_data"]: content_display_data = prepare_content_for_display( content_data["raw_data"], content_data["mimetype"], path ) content = content_display_data["content_data"] language = content_display_data["language"] mimetype = content_display_data["mimetype"] if path: filename = path_info[-1]["name"] query_params["filename"] = filename filepath = "/".join(pi["name"] for pi in path_info[:-1]) extra_context["path"] = f"/{filepath}/" if filepath else "/" extra_context["filename"] = filename top_right_link = { "url": reverse( "browse-content-raw", url_args={"query_string": query_string}, query_params={"filename": filename}, ), "icon": swh_object_icons["content"], "text": "Raw File", } swh_objects.append( SWHObjectInfo(object_type=ObjectType.CONTENT, object_id=file_info["target"]) ) else: for d in dirs: if d["type"] == "rev": d["url"] = reverse( "browse-revision", url_args={"sha1_git": d["target"]} ) else: query_params["path"] = path + d["name"] d["url"] = reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ) for f in files: query_params["path"] = path + f["name"] f["url"] = reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ) if f["name"].lower().startswith("readme"): readmes[f["name"]] = f["checksums"]["sha1"] readme_name, readme_url, readme_html = get_readme_to_display(readmes) top_right_link = { "url": get_revision_log_url(sha1_git, snapshot_context), "icon": swh_object_icons["revisions history"], "text": "History", } vault_cooking["directory_context"] = True vault_cooking["directory_swhid"] = f"swh:1:dir:{dir_id}" swh_objects.append( SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=dir_id) ) query_params.pop("path", None) diff_revision_url = reverse( - "diff-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, + "diff-revision", + url_args={"sha1_git": sha1_git}, + query_params=query_params, ) if snapshot_id: swh_objects.append( SWHObjectInfo(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id) ) swhids_info = get_swhids_info(swh_objects, snapshot_context, extra_context) heading = "Revision - %s - %s" % ( sha1_git[:7], textwrap.shorten(message_lines[0], width=70), ) if snapshot_context: context_found = "snapshot: %s" % snapshot_context["snapshot_id"] if origin_info: context_found = "origin: %s" % origin_info["url"] heading += " - %s" % context_found return render( request, "browse/revision.html", { "heading": heading, "swh_object_id": swhids_info[0]["swhid"], "swh_object_name": "Revision", "swh_object_metadata": revision_metadata, "message_header": message_lines[0], "message_body": "\n".join(message_lines[1:]), "parents": parents, "snapshot_context": snapshot_context, "dirs": dirs, "files": files, "content": content, "content_size": content_size, "max_content_size": content_display_max_size, "filename": filename, "encoding": content_data.get("encoding"), "mimetype": mimetype, "language": language, "readme_name": 
readme_name, "readme_url": readme_url, "readme_html": readme_html, "breadcrumbs": breadcrumbs, "top_right_link": top_right_link, "vault_cooking": vault_cooking, "diff_revision_url": diff_revision_url, "show_actions": True, "swhids_info": swhids_info, "error_code": error_info["status_code"], "error_message": http_status_code_message.get(error_info["status_code"]), "error_description": error_info["description"], }, status=error_info["status_code"], ) diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py index 9a9869ec..e9de7b46 100644 --- a/swh/web/common/archive.py +++ b/swh/web/common/archive.py @@ -1,1452 +1,1454 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import datetime import itertools import os import re from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union from urllib.parse import urlparse from swh.model import hashutil from swh.model.model import OriginVisit, Revision from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos import diff, revisions_walker from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_latest, snapshot_resolve_alias from swh.vault.exc import NotFoundExc as VaultNotFoundExc from swh.web import config from swh.web.common import converters, query from swh.web.common.exc import NotFoundExc from swh.web.common.typing import ( OriginInfo, OriginMetadataInfo, OriginVisitInfo, PagedResult, ) search = config.search() storage = config.storage() vault = config.vault() idx_storage = config.indexer_storage() counters = config.counters() MAX_LIMIT = 50 # Top limit the users can ask for def _first_element(lst): """Returns the first element in the provided list or None if it is empty or None""" return next(iter(lst or []), None) def lookup_multiple_hashes(hashes): """Lookup the passed hashes in a single DB connection, using batch processing. Args: An array of {filename: X, sha1: Y}, string X, hex sha1 string Y. Returns: The same array with elements updated with elem['found'] = true if the hash is present in storage, elem['found'] = false if not. """ hashlist = [hashutil.hash_to_bytes(elem["sha1"]) for elem in hashes] content_missing = storage.content_missing_per_sha1(hashlist) missing = [hashutil.hash_to_hex(x) for x in content_missing] for x in hashes: x.update({"found": True}) for h in hashes: if h["sha1"] in missing: h["found"] = False return hashes def lookup_expression(expression, last_sha1, per_page): """Lookup expression in raw content. Args: expression (str): An expression to lookup through raw indexed content last_sha1 (str): Last sha1 seen per_page (int): Number of results per page Yields: ctags whose content match the expression """ limit = min(per_page, MAX_LIMIT) ctags = idx_storage.content_ctags_search( expression, last_sha1=last_sha1, limit=limit ) for ctag in ctags: ctag = converters.from_swh(ctag, hashess={"id"}) ctag["sha1"] = ctag["id"] ctag.pop("id") yield ctag def lookup_hash(q: str) -> Dict[str, Any]: """Check if the storage contains a given content checksum and return it if found. Args: q: query string of the form <hash_algo:hash> Returns: Dict with key found containing the hash info if the hash is present, None if not. 
""" algo, hash_ = query.parse_hash(q) found = _first_element(storage.content_find({algo: hash_})) if found: content = converters.from_content(found.to_dict()) else: content = None return {"found": content, "algo": algo} def search_hash(q: str) -> Dict[str, bool]: """Search storage for a given content checksum. Args: q: query string of the form <hash_algo:hash> Returns: Dict with key found to True or False, according to whether the checksum is present or not """ algo, hash_ = query.parse_hash(q) found = _first_element(storage.content_find({algo: hash_})) return {"found": found is not None} def _lookup_content_sha1(q: str) -> Optional[bytes]: """Given a possible input, query for the content's sha1. Args: q: query string of the form <hash_algo:hash> Returns: binary sha1 if found or None """ algo, hash_ = query.parse_hash(q) if algo != "sha1": hashes = _first_element(storage.content_find({algo: hash_})) if not hashes: return None return hashes.sha1 return hash_ def lookup_content_ctags(q): """Return ctags information from a specified content. Args: q: query string of the form <hash_algo:hash> Yields: ctags information (dict) list if the content is found. """ sha1 = _lookup_content_sha1(q) if not sha1: return None ctags = list(idx_storage.content_ctags_get([sha1])) if not ctags: return None for ctag in ctags: yield converters.from_swh(ctag, hashess={"id"}) def lookup_content_filetype(q): """Return filetype information from a specified content. Args: q: query string of the form <hash_algo:hash> Yields: filetype information (dict) list if the content is found. """ sha1 = _lookup_content_sha1(q) if not sha1: return None filetype = _first_element(list(idx_storage.content_mimetype_get([sha1]))) if not filetype: return None return converters.from_filetype(filetype.to_dict()) def lookup_content_language(q): """Always returns None. This used to return language information from a specified content, but this is currently disabled. Args: q: query string of the form <hash_algo:hash> Yields: language information (dict) list if the content is found. """ return None def lookup_content_license(q): """Return license information from a specified content. Args: q: query string of the form <hash_algo:hash> Yields: license information (dict) list if the content is found. """ sha1 = _lookup_content_sha1(q) if not sha1: return None licenses = list(idx_storage.content_fossology_license_get([sha1])) if not licenses: return None license_dicts = [license.to_dict() for license in licenses] for license_dict in license_dicts: del license_dict["id"] lic = { "id": sha1, "facts": license_dicts, } return converters.from_swh(lic, hashess={"id"}) def lookup_origin(origin: OriginInfo) -> OriginInfo: """Return information about the origin matching dict origin. Args: origin: origin's dict with 'url' key Returns: origin information as dict. """ origin_urls = [origin["url"]] if origin["url"]: # handle case when user provided an origin url with a trailing # slash while the url in storage does not have it (e.g. GitHub) if origin["url"].endswith("/"): origin_urls.append(origin["url"][:-1]) # handle case when user provided an origin url without a trailing # slash while the url in storage have it (e.g. 
Debian source package) else: origin_urls.append(f"{origin['url']}/") try: # handle case where the "://" character sequence was mangled into ":/" parsed_url = urlparse(origin["url"]) if ( parsed_url.scheme and not parsed_url.netloc and origin["url"].startswith(f"{parsed_url.scheme}:/") and not origin["url"].startswith(f"{parsed_url.scheme}://") ): origin_urls.append( origin["url"].replace( f"{parsed_url.scheme}:/", f"{parsed_url.scheme}://" ) ) except Exception: pass origins = [o for o in storage.origin_get(origin_urls) if o is not None] if not origins: msg = "Origin with url %s not found!" % origin["url"] raise NotFoundExc(msg) return converters.from_origin(origins[0].to_dict()) def lookup_origins( page_token: Optional[str], limit: int = 100 ) -> PagedResult[OriginInfo]: """Get list of archived software origins in a paginated way. Origins are sorted by id before being returned. Args: page_token: opaque string used to get the next page of results limit: the maximum number of origins to return Returns: Page of OriginInfo """ page = storage.origin_list(page_token=page_token, limit=limit) return PagedResult( [converters.from_origin(o.to_dict()) for o in page.results], next_page_token=page.next_page_token, ) def lookup_origin_snapshots(origin: OriginInfo) -> List[str]: """Return ids of the snapshots of an origin. Args: origin: origin's dict with 'url' key Returns: List of unique snapshot identifiers in hexadecimal format resulting from the visits of the origin. """ return [ snapshot.hex() for snapshot in storage.origin_snapshot_get_all(origin["url"]) ] def search_origin( url_pattern: str, use_ql: bool = False, limit: int = 50, with_visit: bool = False, visit_types: Optional[List[str]] = None, page_token: Optional[str] = None, ) -> Tuple[List[OriginInfo], Optional[str]]: """Search for origins whose urls contain a provided string pattern or match a provided regular expression. Args: url_pattern: the string pattern to search for in origin urls use_ql: whether to use swh search query language or not limit: the maximum number of found origins to return with_visit: Whether origins with no visit are to be filtered out visit_types: Only origins having any of the provided visit types (e.g. git, svn, pypi) will be returned page_token: opaque string used to get the next results of a search Returns: list of origin information as dict.
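Example (illustrative only; actual results depend on the archive contents and on whether swh-search is configured)::

    >>> origins, next_page_token = search_origin("hello", limit=10, with_visit=True)
    >>> isinstance(origins, list)
    True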
""" if page_token: assert isinstance(page_token, str) if search: if use_ql: page_result = search.origin_search( query=url_pattern, page_token=page_token, with_visit=with_visit, visit_types=visit_types, limit=limit, ) else: page_result = search.origin_search( url_pattern=url_pattern, page_token=page_token, with_visit=with_visit, visit_types=visit_types, limit=limit, ) origins = [converters.from_origin(ori_dict) for ori_dict in page_result.results] else: # Fallback to swh-storage if swh-search is not configured search_words = [re.escape(word) for word in url_pattern.split()] if len(search_words) >= 7: url_pattern = ".*".join(search_words) else: pattern_parts = [] for permut in itertools.permutations(search_words): pattern_parts.append(".*".join(permut)) url_pattern = "|".join(pattern_parts) page_result = storage.origin_search( url_pattern, page_token=page_token, with_visit=with_visit, limit=limit, visit_types=visit_types, regexp=True, ) origins = [converters.from_origin(ori.to_dict()) for ori in page_result.results] return (origins, page_result.next_page_token) def search_origin_metadata( fulltext: str, limit: int = 50 ) -> Iterable[OriginMetadataInfo]: """Search for origins whose metadata match a provided string pattern. Args: fulltext: the string pattern to search for in origin metadata limit: the maximum number of found origins to return Returns: Iterable of origin metadata information for existing origins """ results = [] if ( search and config.get_config()["search_config"]["metadata_backend"] == "swh-search" ): - page_result = search.origin_search(metadata_pattern=fulltext, limit=limit,) + page_result = search.origin_search( + metadata_pattern=fulltext, + limit=limit, + ) matches = idx_storage.origin_intrinsic_metadata_get( [r["url"] for r in page_result.results] ) else: matches = idx_storage.origin_intrinsic_metadata_search_fulltext( conjunction=[fulltext], limit=limit ) matches = [match.to_dict() for match in matches] origins = storage.origin_get([match["id"] for match in matches]) for origin, match in zip(origins, matches): if not origin: continue match["from_revision"] = hashutil.hash_to_hex(match["from_revision"]) del match["id"] results.append(OriginMetadataInfo(url=origin.url, metadata=match)) return results def lookup_origin_intrinsic_metadata(origin_url: str) -> Dict[str, Any]: """Return intrinsic metadata for origin whose origin matches given origin. Args: origin_url: origin url Raises: NotFoundExc when the origin is not found Returns: origin metadata. """ origins = [origin_url] origin_info = storage.origin_get(origins)[0] if not origin_info: raise NotFoundExc(f"Origin with url {origin_url} not found!") match = _first_element(idx_storage.origin_intrinsic_metadata_get(origins)) result = {} if match: result = match.metadata return result def _to_sha1_bin(sha1_hex): _, sha1_git_bin = query.parse_hash_with_algorithms_or_throws( sha1_hex, ["sha1"], "Only sha1_git is supported." # HACK: sha1_git really ) return sha1_git_bin def _check_directory_exists(sha1_git, sha1_git_bin): if len(list(storage.directory_missing([sha1_git_bin]))): raise NotFoundExc("Directory with sha1_git %s not found" % sha1_git) def lookup_directory(sha1_git): """Return information about the directory with id sha1_git. Args: sha1_git as string Returns: directory information as dict. 
""" empty_dir_sha1 = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" if sha1_git == empty_dir_sha1: return [] sha1_git_bin = _to_sha1_bin(sha1_git) _check_directory_exists(sha1_git, sha1_git_bin) directory_entries = storage.directory_ls(sha1_git_bin) return map(converters.from_directory_entry, directory_entries) def lookup_directory_with_path(sha1_git: str, path: str) -> Dict[str, Any]: """Return directory information for entry with specified path w.r.t. root directory pointed by sha1_git Args: sha1_git: sha1_git corresponding to the directory to which we append paths to (hopefully) find the entry path: the relative path to the entry starting from the root directory pointed by sha1_git Returns: Directory entry information as dict. Raises: NotFoundExc if the directory entry is not found """ sha1_git_bin = _to_sha1_bin(sha1_git) _check_directory_exists(sha1_git, sha1_git_bin) paths = path.strip(os.path.sep).split(os.path.sep) queried_dir = storage.directory_entry_get_by_path( sha1_git_bin, [p.encode("utf-8") for p in paths] ) if not queried_dir: raise NotFoundExc( f"Directory entry with path {path} from root directory {sha1_git} not found" ) return converters.from_directory_entry(queried_dir) def lookup_release(release_sha1_git: str) -> Dict[str, Any]: """Return information about the release with sha1 release_sha1_git. Args: release_sha1_git: The release's sha1 as hexadecimal Returns: Release information as dict. Raises: ValueError if the identifier provided is not of sha1 nature. """ sha1_git_bin = _to_sha1_bin(release_sha1_git) release = _first_element(storage.release_get([sha1_git_bin])) if not release: raise NotFoundExc(f"Release with sha1_git {release_sha1_git} not found.") return converters.from_release(release) def lookup_release_multiple(sha1_git_list) -> Iterator[Optional[Dict[str, Any]]]: """Return information about the releases identified with their sha1_git identifiers. Args: sha1_git_list: A list of release sha1_git identifiers Returns: Iterator of Release metadata information as dict. Raises: ValueError if the identifier provided is not of sha1 nature. """ sha1_bin_list = [_to_sha1_bin(sha1_git) for sha1_git in sha1_git_list] releases = storage.release_get(sha1_bin_list) for r in releases: if r is not None: yield converters.from_release(r) else: yield None def lookup_revision(rev_sha1_git) -> Dict[str, Any]: """Return information about the revision with sha1 revision_sha1_git. Args: revision_sha1_git: The revision's sha1 as hexadecimal Returns: Revision information as dict. Raises: ValueError if the identifier provided is not of sha1 nature. NotFoundExc if there is no revision with the provided sha1_git. """ sha1_git_bin = _to_sha1_bin(rev_sha1_git) revision = storage.revision_get([sha1_git_bin])[0] if not revision: raise NotFoundExc(f"Revision with sha1_git {rev_sha1_git} not found.") return converters.from_revision(revision) def lookup_revision_multiple(sha1_git_list) -> Iterator[Optional[Dict[str, Any]]]: """Return information about the revisions identified with their sha1_git identifiers. Args: sha1_git_list: A list of revision sha1_git identifiers Yields: revision information as dict if the revision exists, None otherwise. Raises: ValueError if the identifier provided is not of sha1 nature. 
""" sha1_bin_list = [_to_sha1_bin(sha1_git) for sha1_git in sha1_git_list] revisions = storage.revision_get(sha1_bin_list) for revision in revisions: if revision is not None: yield converters.from_revision(revision) else: yield None def lookup_revision_message(rev_sha1_git) -> Dict[str, bytes]: """Return the raw message of the revision with sha1 revision_sha1_git. Args: revision_sha1_git: The revision's sha1 as hexadecimal Returns: Decoded revision message as dict {'message': <the_message>} Raises: ValueError if the identifier provided is not of sha1 nature. NotFoundExc if the revision is not found, or if it has no message """ sha1_git_bin = _to_sha1_bin(rev_sha1_git) revision = storage.revision_get([sha1_git_bin])[0] if not revision: raise NotFoundExc(f"Revision with sha1_git {rev_sha1_git} not found.") if not revision.message: raise NotFoundExc(f"No message for revision with sha1_git {rev_sha1_git}.") return {"message": revision.message} def _lookup_revision_id_by(origin, branch_name, timestamp): def _get_snapshot_branch(snapshot, branch_name): snapshot = lookup_snapshot( visit["snapshot"], branches_from=branch_name, branches_count=10, branch_name_exclude_prefix=None, ) branch = None if branch_name in snapshot["branches"]: branch = snapshot["branches"][branch_name] return branch if isinstance(origin, int): origin = {"id": origin} elif isinstance(origin, str): origin = {"url": origin} else: raise TypeError('"origin" must be an int or a string.') from swh.web.common.origin_visits import get_origin_visit visit = get_origin_visit(origin, visit_ts=timestamp) branch = _get_snapshot_branch(visit["snapshot"], branch_name) rev_id = None if branch and branch["target_type"] == "revision": rev_id = branch["target"] elif branch and branch["target_type"] == "alias": branch = _get_snapshot_branch(visit["snapshot"], branch["target"]) if branch and branch["target_type"] == "revision": rev_id = branch["target"] if not rev_id: raise NotFoundExc( "Revision for origin %s and branch %s not found." % (origin.get("url"), branch_name) ) return rev_id def lookup_revision_by(origin, branch_name="HEAD", timestamp=None): """Lookup revision by origin, snapshot branch name and visit timestamp. If branch_name is not provided, lookup using 'HEAD' as default. If timestamp is not provided, use the most recent. Args: origin (Union[int,str]): origin of the revision branch_name (str): snapshot branch name timestamp (str/int): origin visit time frame Returns: dict: The revision matching the criterions Raises: NotFoundExc if no revision corresponds to the criterion """ rev_id = _lookup_revision_id_by(origin, branch_name, timestamp) return lookup_revision(rev_id) def lookup_revision_log(rev_sha1_git, limit): """Lookup revision log by revision id. Args: rev_sha1_git (str): The revision's sha1 as hexadecimal limit (int): the maximum number of revisions returned Returns: list: Revision log as list of revision dicts Raises: ValueError: if the identifier provided is not of sha1 nature. swh.web.common.exc.NotFoundExc: if there is no revision with the provided sha1_git. """ lookup_revision(rev_sha1_git) sha1_git_bin = _to_sha1_bin(rev_sha1_git) revision_entries = storage.revision_log([sha1_git_bin], limit=limit) return map(converters.from_revision, revision_entries) def lookup_revision_log_by(origin, branch_name, timestamp, limit): """Lookup revision by origin, snapshot branch name and visit timestamp. 
Args: origin (Union[int,str]): origin of the revision branch_name (str): snapshot branch timestamp (str/int): origin visit time frame limit (int): the maximum number of revisions returned Returns: list: Revision log as list of revision dicts Raises: swh.web.common.exc.NotFoundExc: if no revision corresponds to the criterion """ rev_id = _lookup_revision_id_by(origin, branch_name, timestamp) return lookup_revision_log(rev_id, limit) def lookup_revision_with_context_by( origin, branch_name, timestamp, sha1_git, limit=100 ): """Return information about revision sha1_git, limited to the sub-graph of all transitive parents of sha1_git_root. sha1_git_root being resolved through the lookup of a revision by origin, branch_name and ts. In other words, sha1_git is an ancestor of sha1_git_root. Args: - origin: origin of the revision. - branch_name: revision's branch. - timestamp: revision's time frame. - sha1_git: one of sha1_git_root's ancestors. - limit: limit the lookup to 100 revisions back. Returns: Pair of (root_revision, revision). Information on sha1_git if it is an ancestor of sha1_git_root including children leading to sha1_git_root Raises: - BadInputExc in case of unknown algo_hash or bad hash. - NotFoundExc if either revision is not found or if sha1_git is not an ancestor of sha1_git_root. """ rev_root_id = _lookup_revision_id_by(origin, branch_name, timestamp) rev_root_id_bin = hashutil.hash_to_bytes(rev_root_id) rev_root = storage.revision_get([rev_root_id_bin])[0] return ( converters.from_revision(rev_root) if rev_root else None, lookup_revision_with_context(rev_root, sha1_git, limit), ) def lookup_revision_with_context( sha1_git_root: Union[str, Dict[str, Any], Revision], sha1_git: str, limit: int = 100 ) -> Dict[str, Any]: """Return information about revision sha1_git, limited to the sub-graph of all transitive parents of sha1_git_root. In other words, sha1_git is an ancestor of sha1_git_root. Args: sha1_git_root: latest revision. The type is either a sha1 (as an hex string) or a non converted dict. 
sha1_git: one of sha1_git_root's ancestors limit: limit the lookup to 100 revisions back Returns: Information on sha1_git if it is an ancestor of sha1_git_root including children leading to sha1_git_root Raises: BadInputExc in case of unknown algo_hash or bad hash NotFoundExc if either revision is not found or if sha1_git is not an ancestor of sha1_git_root """ sha1_git_bin = _to_sha1_bin(sha1_git) revision = storage.revision_get([sha1_git_bin])[0] if not revision: raise NotFoundExc(f"Revision {sha1_git} not found") if isinstance(sha1_git_root, str): sha1_git_root_bin = _to_sha1_bin(sha1_git_root) revision_root = storage.revision_get([sha1_git_root_bin])[0] if not revision_root: raise NotFoundExc(f"Revision root {sha1_git_root} not found") elif isinstance(sha1_git_root, Revision): sha1_git_root_bin = sha1_git_root.id else: sha1_git_root_bin = sha1_git_root["id"] revision_log = storage.revision_log([sha1_git_root_bin], limit=limit) parents: Dict[str, List[str]] = {} children = defaultdict(list) for rev in revision_log: rev_id = rev["id"] parents[rev_id] = [] for parent_id in rev["parents"]: parents[rev_id].append(parent_id) children[parent_id].append(rev_id) if revision.id not in parents: raise NotFoundExc(f"Revision {sha1_git} is not an ancestor of {sha1_git_root}") revision_d = revision.to_dict() revision_d["children"] = children[revision.id] return converters.from_revision(revision_d) def lookup_directory_with_revision(sha1_git, dir_path=None, with_data=False): """Return information on the directory pointed to by the revision with sha1_git. If dir_path is not provided, display the top level directory. Otherwise, display the directory pointed to by dir_path (if it exists). Args: sha1_git: revision's hash. dir_path: optional directory pointed to by that revision. with_data: boolean that indicates whether to retrieve the raw data if the path resolves to a content. Defaults to False (for the api). Returns: Information on the directory pointed to by that revision. Raises: BadInputExc in case of unknown algo_hash or bad hash. NotFoundExc either if the revision is not found or the referenced path does not exist. NotImplementedError in case dir_path exists but references neither a 'dir' nor a 'file' type. """ sha1_git_bin = _to_sha1_bin(sha1_git) revision = storage.revision_get([sha1_git_bin])[0] if not revision: raise NotFoundExc(f"Revision {sha1_git} not found") dir_sha1_git_bin = revision.directory if dir_path: paths = dir_path.strip(os.path.sep).split(os.path.sep) entity = storage.directory_entry_get_by_path( dir_sha1_git_bin, list(map(lambda p: p.encode("utf-8"), paths)) ) if not entity: raise NotFoundExc( "Directory or File '%s' pointed to by revision %s not found" % (dir_path, sha1_git) ) else: entity = {"type": "dir", "target": dir_sha1_git_bin} if entity["type"] == "dir": directory_entries = storage.directory_ls(entity["target"]) or [] return { "type": "dir", "path": "." if not dir_path else dir_path, "revision": sha1_git, "content": list(map(converters.from_directory_entry, directory_entries)), } elif entity["type"] == "file": # content content = _first_element(storage.content_find({"sha1_git": entity["target"]})) if not content: raise NotFoundExc(f"Content not found for revision {sha1_git}") content_d = content.to_dict() if with_data: data = storage.content_get_data(content.sha1) if data: content_d["data"] = data return { "type": "file", "path": "."
if not dir_path else dir_path, "revision": sha1_git, "content": converters.from_content(content_d), } elif entity["type"] == "rev": # revision revision = storage.revision_get([entity["target"]])[0] return { "type": "rev", "path": "." if not dir_path else dir_path, "revision": sha1_git, "content": converters.from_revision(revision) if revision else None, } else: raise NotImplementedError("Entity of type %s not implemented." % entity["type"]) def lookup_content(q: str) -> Dict[str, Any]: """Lookup the content designated by q. Args: q: query string of the form <hash_algo:hash> Returns: Content information as dict. Raises: NotFoundExc if the requested content is not found """ algo, hash_ = query.parse_hash(q) c = _first_element(storage.content_find({algo: hash_})) if not c: hhex = hashutil.hash_to_hex(hash_) raise NotFoundExc(f"Content with {algo} checksum equals to {hhex} not found!") return converters.from_content(c.to_dict()) def lookup_content_raw(q: str) -> Dict[str, Any]: """Lookup the content defined by q. Args: q: query string of the form <hash_algo:hash> Returns: dict with 'sha1' and 'data' keys, data representing its raw data decoded. Raises: NotFoundExc if the requested content is not found or if the content bytes are not available in the storage """ c = lookup_content(q) content_sha1_bytes = hashutil.hash_to_bytes(c["checksums"]["sha1"]) content_data = storage.content_get_data(content_sha1_bytes) if content_data is None: algo, hash_ = query.parse_hash(q) raise NotFoundExc( f"Bytes of content with {algo} checksum equals " f"to {hashutil.hash_to_hex(hash_)} are not available!" ) return converters.from_content({"sha1": content_sha1_bytes, "data": content_data}) def stat_counters(): """Return the stat counters for Software Heritage Returns: A dict mapping textual labels to integer values. """ res = {} if counters and config.get_config()["counters_backend"] == "swh-counters": res = counters.get_counts( ["origin", "revision", "content", "directory", "release", "person"] ) else: res = storage.stat_counters() return res def _lookup_origin_visits( origin_url: str, last_visit: Optional[int] = None, limit: int = 10 ) -> Iterator[OriginVisit]: """Yield the visits of an origin. Args: origin_url (str): origin to list visits for last_visit (int): last visit to lookup from limit (int): Number of elements max to display Yields: OriginVisit for that origin """ limit = min(limit, MAX_LIMIT) page_token: Optional[str] if last_visit is not None: page_token = str(last_visit) else: page_token = None visit_page = storage.origin_visit_get( origin_url, page_token=page_token, limit=limit ) yield from visit_page.results def lookup_origin_visits( origin: str, last_visit: Optional[int] = None, per_page: int = 10 ) -> Iterator[OriginVisitInfo]: """Yield the visits of an origin. Args: origin: origin to list visits for Yields: Dictionaries of origin_visit for that origin """ for visit in _lookup_origin_visits(origin, last_visit=last_visit, limit=per_page): visit_status = storage.origin_visit_status_get_latest(origin, visit.visit) yield converters.from_origin_visit( {**visit_status.to_dict(), "type": visit.type} ) def lookup_origin_visit_latest( origin_url: str, require_snapshot: bool = False, type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, ) -> Optional[OriginVisitInfo]: """Return the origin's latest visit Args: origin_url: origin to list visits for type: Optional visit type to filter on (e.g. git, tar, dsc, svn, hg, npm, pypi, ...) allowed_statuses: list of visit statuses considered to find the latest visit.
For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. require_snapshot: filter out origins without a snapshot Returns: The origin visit info as dict if found """ visit_status = origin_get_latest_visit_status( storage, origin_url, type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, ) return ( converters.from_origin_visit(visit_status.to_dict()) if visit_status else None ) def lookup_origin_visit(origin_url: str, visit_id: int) -> OriginVisitInfo: """Return information about the visit with id visit_id of origin origin_url. Args: origin_url: origin concerned by the visit visit_id: the visit identifier to lookup Raises: NotFoundExc if no origin visit matching the criteria is found Returns: The dict origin_visit concerned """ visit = storage.origin_visit_get_by(origin_url, visit_id) visit_status = storage.origin_visit_status_get_latest(origin_url, visit_id) if not visit: raise NotFoundExc( f"Origin {origin_url} or its visit with id {visit_id} not found!" ) return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) def origin_visit_find_by_date( origin_url: str, visit_date: datetime.datetime, greater_or_equal: bool = True ) -> Optional[OriginVisitInfo]: """Retrieve the origin visit status whose date is more recent than the provided visit_date. Args: origin_url: origin concerned by the visit visit_date: provided visit date greater_or_equal: ensure the returned visit has a date greater than or equal to the one passed as parameter Returns: The dict origin_visit_status matching the criteria if any. """ visit = storage.origin_visit_find_by_date(origin_url, visit_date) if greater_or_equal and visit and visit.date < visit_date: # when the found visit predates the provided date, try to find a more # recent one visits = storage.origin_visit_get( - origin_url, page_token=str(visit.visit), limit=1, + origin_url, + page_token=str(visit.visit), + limit=1, ).results visit = visits[0] if visits else None if not visit: return None visit_status = storage.origin_visit_status_get_latest(origin_url, visit.visit) return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) def lookup_snapshot_sizes( snapshot_id: str, branch_name_exclude_prefix: Optional[str] = "refs/pull/" ) -> Dict[str, int]: """Count the number of branches in the snapshot with the given id. Args: snapshot_id (str): sha1 identifier of the snapshot Returns: dict: A dict whose keys are the target types of branches and whose values are their corresponding counts """ snapshot_id_bin = _to_sha1_bin(snapshot_id) snapshot_sizes = dict.fromkeys(("alias", "release", "revision"), 0) branch_counts = storage.snapshot_count_branches( snapshot_id_bin, branch_name_exclude_prefix.encode() if branch_name_exclude_prefix else None, ) # remove possible None key returned by snapshot_count_branches # when null branches are present in the snapshot branch_counts.pop(None, None) snapshot_sizes.update(branch_counts) return snapshot_sizes def lookup_snapshot( snapshot_id: str, branches_from: str = "", branches_count: int = 1000, target_types: Optional[List[str]] = None, branch_name_include_substring: Optional[str] = None, branch_name_exclude_prefix: Optional[str] = "refs/pull/", ) -> Dict[str, Any]: """Return information about a snapshot, aka the list of named branches found during a specific visit of an origin.
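# Illustrative restatement of the greater-or-equal selection implemented by
# origin_visit_find_by_date above, using bisect over (date, visit_id) pairs
# already sorted by date (sample types assumed, not part of the swh-web API):
import bisect
import datetime
from typing import List, Optional, Tuple

def visit_on_or_after(
    visits: List[Tuple[datetime.datetime, int]], target: datetime.datetime
) -> Optional[int]:
    idx = bisect.bisect_left([d for d, _ in visits], target)
    return visits[idx][1] if idx < len(visits) else None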
Args: snapshot_id: sha1 identifier of the snapshot branches_from: optional parameter used to skip branches whose name is lexicographically less than it before returning them branches_count: optional parameter used to restrict the number of returned branches target_types: optional parameter used to filter the target types of branch to return (possible values that can be contained in that list are `'content', 'directory', 'revision', 'release', 'snapshot', 'alias'`) branch_name_include_substring: if provided, only return branches whose name contains given substring branch_name_exclude_prefix: if provided, do not return branches whose name starts with given prefix Raises: NotFoundExc if the given snapshot_id is missing Returns: A dict filled with the snapshot content. """ snapshot_id_bin = _to_sha1_bin(snapshot_id) if storage.snapshot_missing([snapshot_id_bin]): raise NotFoundExc(f"Snapshot with id {snapshot_id} not found!") partial_branches = storage.snapshot_get_branches( snapshot_id_bin, branches_from.encode(), branches_count, target_types, branch_name_include_substring.encode() if branch_name_include_substring else None, branch_name_exclude_prefix.encode() if branch_name_exclude_prefix else None, ) return ( converters.from_partial_branches(partial_branches) if partial_branches else None ) def lookup_latest_origin_snapshot( origin: str, allowed_statuses: Optional[List[str]] = None ) -> Optional[Dict[str, Any]]: """Return information about the latest snapshot of an origin. .. warning:: At most 1000 branches contained in the snapshot will be returned for performance reasons. Args: origin: URL or integer identifier of the origin allowed_statuses: list of visit statuses considered to find the latest snapshot for the visit. For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. Returns: A dict filled with the snapshot content. """ snp = snapshot_get_latest( storage, origin, allowed_statuses=allowed_statuses, branches_count=1000 ) return converters.from_snapshot(snp.to_dict()) if snp is not None else None def lookup_snapshot_alias( snapshot_id: str, alias_name: str ) -> Optional[Dict[str, Any]]: """Try to resolve a branch alias in a snapshot. Args: snapshot_id: hexadecimal representation of a snapshot id alias_name: name of the branch alias to resolve Returns: Target branch information or None if the alias does not exist or targets a dangling branch. """ resolved_alias = snapshot_resolve_alias( storage, _to_sha1_bin(snapshot_id), alias_name.encode() ) return ( converters.from_swh(resolved_alias.to_dict(), hashess={"target"}) if resolved_alias is not None else None ) def lookup_revision_through(revision, limit=100): """Retrieve a revision from the criteria stored in the revision dictionary. Args: revision: Dictionary of criteria to lookup the revision with. Here are the supported combinations of possible values: - origin_url, branch_name, ts, sha1_git - origin_url, branch_name, ts - sha1_git_root, sha1_git - sha1_git Returns: The revision if found, None otherwise.
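# The key-based dispatch in the function body below can be summarized as an
# ordered table of (required keys, lookup strategy) pairs, tried from most to
# least specific; hypothetical sketch, the strategy names are illustrative:
_CRITERIA_ORDER = [
    ({"origin_url", "branch_name", "ts", "sha1_git"}, "with_context_by_origin"),
    ({"origin_url", "branch_name", "ts"}, "by_origin"),
    ({"sha1_git_root", "sha1_git"}, "with_context"),
    ({"sha1_git"}, "by_id"),
]

def pick_strategy(criteria: dict) -> str:
    for required, strategy in _CRITERIA_ORDER:
        if required.issubset(criteria):
            return strategy
    raise NotImplementedError("Unsupported criteria combination")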
""" if ( "origin_url" in revision and "branch_name" in revision and "ts" in revision and "sha1_git" in revision ): return lookup_revision_with_context_by( revision["origin_url"], revision["branch_name"], revision["ts"], revision["sha1_git"], limit, ) if "origin_url" in revision and "branch_name" in revision and "ts" in revision: return lookup_revision_by( revision["origin_url"], revision["branch_name"], revision["ts"] ) if "sha1_git_root" in revision and "sha1_git" in revision: return lookup_revision_with_context( revision["sha1_git_root"], revision["sha1_git"], limit ) if "sha1_git" in revision: return lookup_revision(revision["sha1_git"]) # this should not happen raise NotImplementedError("Should not happen!") def lookup_directory_through_revision(revision, path=None, limit=100, with_data=False): """Retrieve the directory information from the revision. Args: revision: dictionary of criterion representing a revision to lookup path: directory's path to lookup. limit: optional query parameter to limit the revisions log (default to 100). For now, note that this limit could impede the transitivity conclusion about sha1_git not being an ancestor of. with_data: indicate to retrieve the content's raw data if path resolves to a content. Returns: The directory pointing to by the revision criterions at path. """ rev = lookup_revision_through(revision, limit) if not rev: raise NotFoundExc("Revision with criterion %s not found!" % revision) return (rev["id"], lookup_directory_with_revision(rev["id"], path, with_data)) def _vault_request(vault_fn, bundle_type: str, swhid: CoreSWHID, **kwargs): try: return vault_fn(bundle_type, swhid, **kwargs) except VaultNotFoundExc: return None def vault_cook(bundle_type: str, swhid: CoreSWHID, email=None): - """Cook a vault bundle. - """ + """Cook a vault bundle.""" return _vault_request(vault.cook, bundle_type, swhid, email=email) def vault_fetch(bundle_type: str, swhid: CoreSWHID): - """Fetch a vault bundle. - """ + """Fetch a vault bundle.""" return _vault_request(vault.fetch, bundle_type, swhid) def vault_progress(bundle_type: str, swhid: CoreSWHID): - """Get the current progress of a vault bundle. - """ + """Get the current progress of a vault bundle.""" return _vault_request(vault.progress, bundle_type, swhid) def diff_revision(rev_id): """Get the list of file changes (insertion / deletion / modification / renaming) for a particular revision. """ rev_sha1_git_bin = _to_sha1_bin(rev_id) changes = diff.diff_revision(storage, rev_sha1_git_bin, track_renaming=True) for change in changes: change["from"] = converters.from_directory_entry(change["from"]) change["to"] = converters.from_directory_entry(change["to"]) if change["from_path"]: change["from_path"] = change["from_path"].decode("utf-8") if change["to_path"]: change["to_path"] = change["to_path"].decode("utf-8") return changes class _RevisionsWalkerProxy(object): """ Proxy class wrapping a revisions walker iterator from swh-storage and performing needed conversions. 
""" def __init__(self, rev_walker_type, rev_start, *args, **kwargs): rev_start_bin = hashutil.hash_to_bytes(rev_start) self.revisions_walker = revisions_walker.get_revisions_walker( rev_walker_type, storage, rev_start_bin, *args, **kwargs ) def export_state(self): return self.revisions_walker.export_state() def __next__(self): return converters.from_revision(next(self.revisions_walker)) def __iter__(self): return self def get_revisions_walker(rev_walker_type, rev_start, *args, **kwargs): """ Utility function to instantiate a revisions walker of a given type, see :mod:`swh.storage.algos.revisions_walker`. Args: rev_walker_type (str): the type of revisions walker to return, possible values are: ``committer_date``, ``dfs``, ``dfs_post``, ``bfs`` and ``path`` rev_start (str): hexadecimal representation of a revision identifier args (list): position arguments to pass to the revisions walker constructor kwargs (dict): keyword arguments to pass to the revisions walker constructor """ # first check if the provided revision is valid lookup_revision(rev_start) return _RevisionsWalkerProxy(rev_walker_type, rev_start, *args, **kwargs) def lookup_object(object_type: ObjectType, object_id: str) -> Dict[str, Any]: """ Utility function for looking up an object in the archive by its type and id. Args: object_type (str): the type of object to lookup, either *content*, *directory*, *release*, *revision* or *snapshot* object_id (str): the *sha1_git* checksum identifier in hexadecimal form of the object to lookup Returns: Dict[str, Any]: A dictionary describing the object or a list of dictionary for the directory object type. Raises: swh.web.common.exc.NotFoundExc: if the object could not be found in the archive BadInputExc: if the object identifier is invalid """ if object_type == ObjectType.CONTENT: return lookup_content(f"sha1_git:{object_id}") elif object_type == ObjectType.DIRECTORY: return {"id": object_id, "content": list(lookup_directory(object_id))} elif object_type == ObjectType.RELEASE: return lookup_release(object_id) elif object_type == ObjectType.REVISION: return lookup_revision(object_id) elif object_type == ObjectType.SNAPSHOT: return lookup_snapshot(object_id) else: raise ValueError(f"Unexpected object type variant: {object_type}") def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[str]: """Lookup missing Software Heritage persistent identifier hash, using batch processing. Args: A dictionary with: keys: object types values: object hashes Returns: A set(hexadecimal) of the hashes not found in the storage """ missing_hashes = [] for obj_type, obj_ids in grouped_swhids.items(): if obj_type == ObjectType.CONTENT: missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids)) elif obj_type == ObjectType.DIRECTORY: missing_hashes.append(storage.directory_missing(obj_ids)) elif obj_type == ObjectType.REVISION: missing_hashes.append(storage.revision_missing(obj_ids)) elif obj_type == ObjectType.RELEASE: missing_hashes.append(storage.release_missing(obj_ids)) elif obj_type == ObjectType.SNAPSHOT: missing_hashes.append(storage.snapshot_missing(obj_ids)) missing = set( map(lambda x: hashutil.hash_to_hex(x), itertools.chain(*missing_hashes)) ) return missing def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]: """Lookup origins from the sha1 hash values of their URLs. 
Args: sha1s: list of hexadecimal representations of sha1 hashes Yields: origin information as dict """ sha1s_bytes = [hashutil.hash_to_bytes(sha1) for sha1 in sha1s] origins = storage.origin_get_by_sha1(sha1s_bytes) for origin in origins: yield converters.from_origin(origin) diff --git a/swh/web/common/converters.py b/swh/web/common/converters.py index 74a71d53..d44e0286 100644 --- a/swh/web/common/converters.py +++ b/swh/web/common/converters.py @@ -1,403 +1,384 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json from typing import Any, Dict, Union from django.core.serializers.json import DjangoJSONEncoder from swh.core.utils import decode_with_escape from swh.model import hashutil from swh.model.model import ( RawExtrinsicMetadata, Release, Revision, TimestampWithTimezone, ) from swh.model.swhids import ObjectType from swh.storage.interface import PartialBranches from swh.web.common.typing import OriginInfo, OriginVisitInfo def _group_checksums(data): """Groups checksum values computed from hash functions used in swh and stored in the data dict under a single entry 'checksums' """ if data: checksums = {} for hash in hashutil.ALGORITHMS: if hash in data and data[hash]: checksums[hash] = data[hash] del data[hash] if len(checksums) > 0: data["checksums"] = checksums def fmap(f, data): """Map f to data at each level. This must keep the original data structure type: - map -> map - dict -> dict - list -> list - None -> None Args: f: function that expects one argument. data: data to traverse to apply the f function. list, map, dict or bare value. Returns: The same data-structure with values modified by the f function. """ if data is None: return data if isinstance(data, map): return map(lambda y: fmap(f, y), (x for x in data)) if isinstance(data, list): return [fmap(f, x) for x in data] if isinstance(data, tuple): return tuple(fmap(f, x) for x in data) if isinstance(data, dict): return {k: fmap(f, v) for (k, v) in data.items()} return f(data) def from_swh( dict_swh, hashess={}, bytess={}, dates={}, blacklist={}, removables_if_empty={}, empty_dict={}, empty_list={}, convert={}, convert_fn=lambda x: x, ): """Convert from a swh dictionary to something reasonably json serializable. Args: dict_swh: the original dictionary to be transformed hashess: list/set of keys representing hashes values (sha1, sha256, sha1_git, etc...) as bytes. Those need to be transformed in hexadecimal string bytess: list/set of keys representing bytes values which need to be decoded blacklist: set of keys to filter out from the conversion convert: set of keys whose associated values need to be converted using convert_fn convert_fn: the conversion function to apply on the value of key in 'convert' The remaining keys are copied as is in the output. Returns: dictionary equivalent to dict_swh, only with its keys converted. """ def convert_hashes_bytes(v): - """v is supposedly a hash as bytes, returns it converted in hex. - - """ + """v is supposedly a hash as bytes, returns it converted in hex.""" if isinstance(v, bytes): return hashutil.hash_to_hex(v) return v def convert_bytes(v): """v is supposedly a bytes string, decode as utf-8. FIXME: Improve decoding policy. If not utf-8, break!
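# fmap, defined above, applies a function at every leaf while preserving the
# container shape; for instance, hex-encoding bytes nested inside dicts and
# lists (assuming fmap from this module is in scope):
def _hex_if_bytes(v):
    return v.hex() if isinstance(v, bytes) else v

# fmap(_hex_if_bytes, {"parents": [b"\x01", None]}) == {"parents": ["01", None]}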
""" if isinstance(v, bytes): return v.decode("utf-8") return v def convert_date(v): """ Args: v (dict or datatime): either: - a dict with three keys: - timestamp (dict or integer timestamp) - offset_bytes - or, a datetime We convert it to a human-readable string """ if not v: return v if isinstance(v, datetime.datetime): return v.isoformat() v = v.copy() if isinstance(v["timestamp"], float): v["timestamp"] = int(v["timestamp"]) return TimestampWithTimezone.from_dict(v).to_datetime().isoformat() if not dict_swh: return dict_swh new_dict = {} for key, value in dict_swh.items(): if key in blacklist or (key in removables_if_empty and not value): continue if key in dates: new_dict[key] = convert_date(value) elif key in convert: new_dict[key] = convert_fn(value) elif isinstance(value, dict): new_dict[key] = from_swh( value, hashess=hashess, bytess=bytess, dates=dates, blacklist=blacklist, removables_if_empty=removables_if_empty, empty_dict=empty_dict, empty_list=empty_list, convert=convert, convert_fn=convert_fn, ) elif key in hashess: new_dict[key] = fmap(convert_hashes_bytes, value) elif key in bytess: try: new_dict[key] = fmap(convert_bytes, value) except UnicodeDecodeError: if "decoding_failures" not in new_dict: new_dict["decoding_failures"] = [key] else: new_dict["decoding_failures"].append(key) new_dict[key] = fmap(decode_with_escape, value) elif key in empty_dict and not value: new_dict[key] = {} elif key in empty_list and not value: new_dict[key] = [] else: new_dict[key] = value _group_checksums(new_dict) return new_dict def from_origin(origin: Dict[str, Any]) -> OriginInfo: - """Convert from a swh origin to an origin dictionary. - - """ + """Convert from a swh origin to an origin dictionary.""" return from_swh(origin, blacklist={"id"}) def from_release(release: Release) -> Dict[str, Any]: """Convert from a swh release to a json serializable release dictionary. Args: release: A release model object Returns: release dictionary with the following keys - id: hexadecimal sha1 (string) - revision: hexadecimal sha1 (string) - comment: release's comment message (string) - name: release's name (string) - author: release's author identifier (swh's id) - synthetic: the synthetic property (boolean) """ return from_swh( release.to_dict(), hashess={"id", "target"}, bytess={"message", "name", "fullname", "email"}, dates={"date"}, ) class SWHDjangoJSONEncoder(DjangoJSONEncoder): """Wrapper around DjangoJSONEncoder to serialize SWH-specific types found in :class:`swh.web.common.typing.SWHObjectInfo`.""" def default(self, o): if isinstance(o, ObjectType): return o.name.lower() else: super().default(o) class SWHMetadataEncoder(json.JSONEncoder): """Special json encoder for metadata field which can contain bytes encoded value. """ def default(self, obj): if isinstance(obj, bytes): try: return obj.decode("utf-8") except UnicodeDecodeError: # fallback to binary representation to avoid display errors return repr(obj) # Let the base class default method raise the TypeError return json.JSONEncoder.default(self, obj) def convert_metadata(metadata): - """Convert json specific dict to a json serializable one. - - """ + """Convert json specific dict to a json serializable one.""" if metadata is None: return {} return json.loads(json.dumps(metadata, cls=SWHMetadataEncoder)) def from_revision(revision: Union[Dict[str, Any], Revision]) -> Dict[str, Any]: """Convert swh revision model object to a json serializable revision dictionary. 
Args: revision: revision model object Returns: dict: Revision dictionary with the same keys as inputs, except: - sha1s are in hexadecimal strings (id, directory) - bytes are decoded in string (author_name, committer_name, author_email, committer_email) Remaining keys are left as is """ if isinstance(revision, Revision): revision_d = revision.to_dict() else: revision_d = revision revision_d = from_swh( revision_d, hashess={"id", "directory", "parents", "children"}, bytess={"name", "fullname", "email", "extra_headers", "message"}, convert={"metadata"}, convert_fn=convert_metadata, dates={"date", "committer_date"}, ) if revision_d: if "parents" in revision_d: revision_d["merge"] = len(revision_d["parents"]) > 1 return revision_d def from_raw_extrinsic_metadata( metadata: Union[Dict[str, Any], RawExtrinsicMetadata] ) -> Dict[str, Any]: - """Convert RawExtrinsicMetadata model object to a json serializable dictionary. - """ + """Convert RawExtrinsicMetadata model object to a json serializable dictionary.""" return from_swh( metadata.to_dict() if isinstance(metadata, RawExtrinsicMetadata) else metadata, blacklist={"id", "metadata"}, dates={"discovery_date"}, ) def from_content(content): - """Convert swh content to serializable content dictionary. - - """ + """Convert swh content to serializable content dictionary.""" return from_swh( content, hashess={"sha1", "sha1_git", "sha256", "blake2s256"}, blacklist={"ctime"}, convert={"status"}, convert_fn=lambda v: "absent" if v == "hidden" else v, ) def from_person(person): - """Convert swh person to serializable person dictionary. - - """ + """Convert swh person to serializable person dictionary.""" return from_swh(person, bytess={"name", "fullname", "email"}) def from_origin_visit(visit: Dict[str, Any]) -> OriginVisitInfo: - """Convert swh origin_visit to serializable origin_visit dictionary. - - """ + """Convert swh origin_visit to serializable origin_visit dictionary.""" ov = from_swh( visit, hashess={"target", "snapshot"}, bytess={"branch"}, dates={"date"}, empty_dict={"metadata"}, ) return ov def from_snapshot(snapshot): - """Convert swh snapshot to serializable (partial) snapshot dictionary. - - """ + """Convert swh snapshot to serializable (partial) snapshot dictionary.""" sv = from_swh(snapshot, hashess={"id", "target"}, bytess={"next_branch"}) if sv and "branches" in sv: sv["branches"] = {decode_with_escape(k): v for k, v in sv["branches"].items()} for k, v in snapshot["branches"].items(): # alias target existing branch names, not a sha1 if v and v["target_type"] == "alias": branch = decode_with_escape(k) target = decode_with_escape(v["target"]) sv["branches"][branch]["target"] = target return sv def from_partial_branches(branches: PartialBranches): - """Convert PartialBranches to serializable partial snapshot dictionary - - """ + """Convert PartialBranches to serializable partial snapshot dictionary""" return from_snapshot( { "id": branches["id"], "branches": { branch_name: branch.to_dict() if branch else None for (branch_name, branch) in branches["branches"].items() }, "next_branch": branches["next_branch"], } ) def from_directory_entry(dir_entry): - """Convert swh directory to serializable directory dictionary. 
- - """ + """Convert swh directory to serializable directory dictionary.""" return from_swh( dir_entry, hashess={"dir_id", "sha1_git", "sha1", "sha256", "blake2s256", "target"}, bytess={"name"}, removables_if_empty={"sha1", "sha1_git", "sha256", "blake2s256", "status"}, convert={"status"}, convert_fn=lambda v: "absent" if v == "hidden" else v, ) def from_filetype(content_entry): """Convert swh content to serializable dictionary containing keys 'id', 'encoding', and 'mimetype'. """ return from_swh(content_entry, hashess={"id"}) diff --git a/swh/web/common/identifiers.py b/swh/web/common/identifiers.py index d3492909..debf4efd 100644 --- a/swh/web/common/identifiers.py +++ b/swh/web/common/identifiers.py @@ -1,388 +1,390 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Iterable, List, Optional from urllib.parse import quote, unquote from typing_extensions import TypedDict from django.http import QueryDict from swh.model.exceptions import ValidationError from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.swhids import ObjectType, QualifiedSWHID from swh.web.common import archive from swh.web.common.exc import BadInputExc from swh.web.common.typing import ( QueryParameters, SnapshotContext, SWHIDContext, SWHIDInfo, SWHObjectInfo, ) from swh.web.common.utils import reverse def parse_object_type(object_type: str) -> ObjectType: try: return ObjectType[object_type.upper()] except KeyError: valid_types = ", ".join(variant.name.lower() for variant in ObjectType) raise BadInputExc( f"Invalid swh object type! Valid types are {valid_types}; not {object_type}" ) def gen_swhid( object_type: ObjectType, object_id: str, scheme_version: int = 1, metadata: SWHIDContext = {}, ) -> str: """ Returns the SoftWare Heritage persistent IDentifier for a swh object based on: * the object type * the object id * the SWHID scheme version Args: object_type: the swh object type (content/directory/release/revision/snapshot) object_id: the swh object id (hexadecimal representation of its hash value) scheme_version: the scheme version of the SWHIDs Returns: the SWHID of the object Raises: BadInputExc: if the provided parameters do not enable to generate a valid identifier """ try: decoded_object_id = hash_to_bytes(object_id) obj_swhid = str( QualifiedSWHID( object_type=object_type, object_id=decoded_object_id, scheme_version=scheme_version, **metadata, ) ) except (ValidationError, KeyError, ValueError) as e: raise BadInputExc("Invalid object (%s) for SWHID. %s" % (object_id, e)) else: return obj_swhid class ResolvedSWHID(TypedDict): """parsed SWHID with context""" swhid_parsed: QualifiedSWHID """URL to browse object according to SWHID context""" browse_url: Optional[str] def resolve_swhid( swhid: str, query_params: Optional[QueryParameters] = None ) -> ResolvedSWHID: """ Try to resolve a SoftWare Heritage persistent IDentifier into an url for browsing the targeted object. 
Args: swhid: a SoftWare Heritage persistent IDentifier query_params: optional dict filled with query parameters to append to the browse url Returns: a dict with the following keys: * **swhid_parsed**: the parsed identifier * **browse_url**: the url for browsing the targeted object """ swhid_parsed = get_swhid(swhid) object_type = swhid_parsed.object_type object_id = swhid_parsed.object_id browse_url = None url_args = {} query_dict = QueryDict("", mutable=True) fragment = "" process_lines = object_type == ObjectType.CONTENT if query_params and len(query_params) > 0: for k in sorted(query_params.keys()): query_dict[k] = query_params[k] if swhid_parsed.origin: origin_url = unquote(swhid_parsed.origin) origin_url = archive.lookup_origin({"url": origin_url})["url"] query_dict["origin_url"] = origin_url if swhid_parsed.path and swhid_parsed.path != b"/": query_dict["path"] = swhid_parsed.path.decode("utf8", errors="replace") if swhid_parsed.anchor: directory = b"" if swhid_parsed.anchor.object_type == ObjectType.DIRECTORY: directory = swhid_parsed.anchor.object_id elif swhid_parsed.anchor.object_type == ObjectType.REVISION: revision = archive.lookup_revision( hash_to_hex(swhid_parsed.anchor.object_id) ) directory = revision["directory"] elif swhid_parsed.anchor.object_type == ObjectType.RELEASE: release = archive.lookup_release( hash_to_hex(swhid_parsed.anchor.object_id) ) if release["target_type"] == ObjectType.REVISION.name.lower(): revision = archive.lookup_revision(release["target"]) directory = revision["directory"] if object_type == ObjectType.CONTENT: if ( not swhid_parsed.origin and swhid_parsed.anchor.object_type != ObjectType.REVISION ): # when no origin or revision context, content objects need to have # their path prefixed by root directory id for breadcrumbs display query_dict["path"] = hash_to_hex(directory) + query_dict["path"] else: # remove leading slash from SWHID content path query_dict["path"] = query_dict["path"][1:] elif object_type == ObjectType.DIRECTORY: object_id = directory # remove leading and trailing slashes from SWHID directory path if query_dict["path"].endswith("/"): query_dict["path"] = query_dict["path"][1:-1] else: query_dict["path"] = query_dict["path"][1:] # snapshot context if swhid_parsed.visit: if swhid_parsed.visit.object_type != ObjectType.SNAPSHOT: raise BadInputExc("Visit must be a snapshot SWHID.") query_dict["snapshot"] = hash_to_hex(swhid_parsed.visit.object_id) if swhid_parsed.anchor: if ( swhid_parsed.anchor.object_type == ObjectType.REVISION and object_type != ObjectType.REVISION ): query_dict["revision"] = hash_to_hex(swhid_parsed.anchor.object_id) elif swhid_parsed.anchor.object_type == ObjectType.RELEASE: release = archive.lookup_release( hash_to_hex(swhid_parsed.anchor.object_id) ) if release: query_dict["release"] = release["name"] # browsing content or directory without snapshot context elif ( object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY) and swhid_parsed.anchor ): if swhid_parsed.anchor.object_type == ObjectType.REVISION: # anchor revision, objects are browsed from its view object_type = ObjectType.REVISION object_id = swhid_parsed.anchor.object_id elif ( object_type == ObjectType.DIRECTORY and swhid_parsed.anchor.object_type == ObjectType.DIRECTORY ): # a directory is browsed from its root object_id = swhid_parsed.anchor.object_id if object_type == ObjectType.CONTENT: url_args["query_string"] = f"sha1_git:{hash_to_hex(object_id)}" elif object_type in (ObjectType.DIRECTORY, ObjectType.RELEASE, ObjectType.REVISION): 
url_args["sha1_git"] = hash_to_hex(object_id) elif object_type == ObjectType.SNAPSHOT: url_args["snapshot_id"] = hash_to_hex(object_id) if swhid_parsed.lines and process_lines: lines = swhid_parsed.lines fragment += "#L" + str(lines[0]) if lines[1]: fragment += "-L" + str(lines[1]) if url_args: browse_url = ( reverse( f"browse-{object_type.name.lower()}", url_args=url_args, query_params=query_dict, ) + fragment ) return ResolvedSWHID(swhid_parsed=swhid_parsed, browse_url=browse_url) def get_swhid(swhid: str) -> QualifiedSWHID: """Check if a SWHID is valid and return it parsed. - Args: - swhid: a SoftWare Heritage persistent IDentifier. + Args: + swhid: a SoftWare Heritage persistent IDentifier. - Raises: - BadInputExc: if the provided SWHID can not be parsed. + Raises: + BadInputExc: if the provided SWHID can not be parsed. - Return: - A parsed SWHID. + Return: + A parsed SWHID. """ try: # ensure core part of SWHID is in lower case to avoid parsing error (core, sep, qualifiers) = swhid.partition(";") core = core.lower() return QualifiedSWHID.from_string(core + sep + qualifiers) except ValidationError as ve: raise BadInputExc("Error when parsing identifier: %s" % " ".join(ve.messages)) -def group_swhids(swhids: Iterable[QualifiedSWHID],) -> Dict[ObjectType, List[bytes]]: +def group_swhids( + swhids: Iterable[QualifiedSWHID], +) -> Dict[ObjectType, List[bytes]]: """ Groups many SoftWare Heritage persistent IDentifiers into a dictionary depending on their type. Args: swhids: an iterable of SoftWare Heritage persistent IDentifier objects Returns: A dictionary with: keys: object types values: object hashes """ swhids_by_type: Dict[ObjectType, List[bytes]] = { ObjectType.CONTENT: [], ObjectType.DIRECTORY: [], ObjectType.REVISION: [], ObjectType.RELEASE: [], ObjectType.SNAPSHOT: [], } for obj_swhid in swhids: obj_id = obj_swhid.object_id obj_type = obj_swhid.object_type swhids_by_type[obj_type].append(hash_to_bytes(obj_id)) return swhids_by_type def get_swhids_info( swh_objects: Iterable[SWHObjectInfo], snapshot_context: Optional[SnapshotContext] = None, extra_context: Optional[Dict[str, Any]] = None, ) -> List[SWHIDInfo]: """ Returns a list of dict containing info related to SWHIDs of objects. 
Args: swh_objects: an iterable of dict describing archived objects snapshot_context: optional dict parameter describing the snapshot in which the objects have been found extra_context: optional dict filled with extra contextual info about the objects Returns: a list of dict containing SWHIDs info """ swhids_info = [] for swh_object in swh_objects: if not swh_object["object_id"]: swhids_info.append( SWHIDInfo( object_type=swh_object["object_type"], object_id="", swhid="", swhid_url="", context={}, swhid_with_context=None, swhid_with_context_url=None, ) ) continue object_type = swh_object["object_type"] object_id = swh_object["object_id"] swhid_context: SWHIDContext = {} if snapshot_context: if snapshot_context["origin_info"] is not None: swhid_context["origin"] = quote( snapshot_context["origin_info"]["url"], safe="/?:@&" ) if object_type != ObjectType.SNAPSHOT: swhid_context["visit"] = gen_swhid( ObjectType.SNAPSHOT, snapshot_context["snapshot_id"] ) if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY): if snapshot_context["release_id"] is not None: swhid_context["anchor"] = gen_swhid( ObjectType.RELEASE, snapshot_context["release_id"] ) elif snapshot_context["revision_id"] is not None: swhid_context["anchor"] = gen_swhid( ObjectType.REVISION, snapshot_context["revision_id"] ) if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY): if ( extra_context and "revision" in extra_context and extra_context["revision"] and "anchor" not in swhid_context ): swhid_context["anchor"] = gen_swhid( ObjectType.REVISION, extra_context["revision"] ) elif ( extra_context and "root_directory" in extra_context and extra_context["root_directory"] and "anchor" not in swhid_context and ( object_type != ObjectType.DIRECTORY or extra_context["root_directory"] != object_id ) ): swhid_context["anchor"] = gen_swhid( ObjectType.DIRECTORY, extra_context["root_directory"] ) path = None if extra_context and "path" in extra_context: path = extra_context["path"] or "/" if "filename" in extra_context and object_type == ObjectType.CONTENT: path += extra_context["filename"] if object_type == ObjectType.DIRECTORY and path == "/": path = None if path: swhid_context["path"] = quote(path, safe="/?:@&") swhid = gen_swhid(object_type, object_id) swhid_url = reverse("browse-swhid", url_args={"swhid": swhid}) swhid_with_context = None swhid_with_context_url = None if swhid_context: swhid_with_context = gen_swhid( object_type, object_id, metadata=swhid_context ) swhid_with_context_url = reverse( "browse-swhid", url_args={"swhid": swhid_with_context} ) swhids_info.append( SWHIDInfo( object_type=object_type, object_id=object_id, swhid=swhid, swhid_url=swhid_url, context=swhid_context, swhid_with_context=swhid_with_context, swhid_with_context_url=swhid_with_context_url, ) ) return swhids_info diff --git a/swh/web/common/migrations/0001_initial.py b/swh/web/common/migrations/0001_initial.py index 0061efeb..30903eee 100644 --- a/swh/web/common/migrations/0001_initial.py +++ b/swh/web/common/migrations/0001_initial.py @@ -1,89 +1,96 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import unicode_literals from django.db import migrations, models _authorized_origins = [ "https://github.com/", "https://gitlab.com/", "https://bitbucket.org/", "https://git.code.sf.net/", "http://git.code.sf.net/", 
"https://hg.code.sf.net/", "http://hg.code.sf.net/", "https://svn.code.sf.net/", "http://svn.code.sf.net/", ] def _populate_save_authorized_origins(apps, schema_editor): SaveAuthorizedOrigin = apps.get_model("swh_web_common", "SaveAuthorizedOrigin") for origin_url in _authorized_origins: SaveAuthorizedOrigin.objects.create(url=origin_url) class Migration(migrations.Migration): initial = True operations = [ migrations.CreateModel( name="SaveAuthorizedOrigin", fields=[ ( "id", models.AutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ("url", models.CharField(max_length=200)), ], - options={"db_table": "save_authorized_origin",}, + options={ + "db_table": "save_authorized_origin", + }, ), migrations.CreateModel( name="SaveOriginRequest", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("request_date", models.DateTimeField(auto_now_add=True)), ("origin_type", models.CharField(max_length=200)), ("origin_url", models.CharField(max_length=200)), ( "status", models.TextField( choices=[ ("accepted", "accepted"), ("rejected", "rejected"), ("pending", "pending"), ], default="pending", ), ), ("loading_task_id", models.IntegerField(default=-1)), ], - options={"db_table": "save_origin_request", "ordering": ["-id"],}, + options={ + "db_table": "save_origin_request", + "ordering": ["-id"], + }, ), migrations.CreateModel( name="SaveUnauthorizedOrigin", fields=[ ( "id", models.AutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ("url", models.CharField(max_length=200)), ], - options={"db_table": "save_unauthorized_origin",}, + options={ + "db_table": "save_unauthorized_origin", + }, ), migrations.RunPython(_populate_save_authorized_origins), ] diff --git a/swh/web/common/migrations/0011_saveoriginrequest_user_ids.py b/swh/web/common/migrations/0011_saveoriginrequest_user_ids.py index f08b181c..353c1790 100644 --- a/swh/web/common/migrations/0011_saveoriginrequest_user_ids.py +++ b/swh/web/common/migrations/0011_saveoriginrequest_user_ids.py @@ -1,22 +1,25 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ ("swh_web_common", "0010_saveoriginrequest_user_id"), ] operations = [ - migrations.RemoveField(model_name="saveoriginrequest", name="user_id",), + migrations.RemoveField( + model_name="saveoriginrequest", + name="user_id", + ), migrations.AddField( model_name="saveoriginrequest", name="user_ids", field=models.TextField(null=True), ), ] diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py index 759ce657..1a6d3647 100644 --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,925 +1,930 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from functools import lru_cache from itertools import product import json import logging from typing import Any, Dict, List, Optional, Tuple from prometheus_client import Gauge import requests import sentry_sdk from django.core.exceptions import ObjectDoesNotExist, ValidationError from 
django.core.validators import URLValidator from django.db.models import Q, QuerySet from django.utils.html import escape from swh.scheduler.utils import create_oneshot_task_dict from swh.web.common import archive from swh.web.common.exc import BadInputExc, ForbiddenExc, NotFoundExc from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.common.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc from swh.web.config import get_config, scheduler logger = logging.getLogger(__name__) # Number of days in the past to look up information for MAX_THRESHOLD_DAYS = 30 # Non-terminal visit statuses which need updates NON_TERMINAL_STATUSES = [ VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING, ] def get_origin_save_authorized_urls() -> List[str]: """ Get the list of origin url prefixes authorized to be immediately loaded into the archive (whitelist). Returns: list: The list of authorized origin url prefixes """ return [origin.url for origin in SaveAuthorizedOrigin.objects.all()] def get_origin_save_unauthorized_urls() -> List[str]: """ Get the list of origin url prefixes forbidden to be loaded into the archive (blacklist). Returns: list: the list of unauthorized origin url prefixes """ return [origin.url for origin in SaveUnauthorizedOrigin.objects.all()] def can_save_origin(origin_url: str, bypass_pending_review: bool = False) -> str: """ Check if a software origin can be saved into the archive.
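# The decision implemented below, restated over plain prefix lists
# (illustrative; the real lists live in the SaveAuthorizedOrigin and
# SaveUnauthorizedOrigin models):
def save_request_status_for(url: str, allowed: list, denied: list) -> str:
    if any(url.startswith(prefix) for prefix in denied):
        return "rejected"
    if any(url.startswith(prefix) for prefix in allowed):
        return "accepted"
    return "pending"

# e.g. save_request_status_for("https://github.com/x/y", ["https://github.com/"], [])
# -> "accepted"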
Based on the origin url, the save request will be either: * immediately accepted if the url is whitelisted * rejected if the url is blacklisted * put in pending state for manual review otherwise Args: origin_url (str): the software origin url to check Returns: str: the origin save request status, either **accepted**, **rejected** or **pending** """ # origin url may be blacklisted for url_prefix in get_origin_save_unauthorized_urls(): if origin_url.startswith(url_prefix): return SAVE_REQUEST_REJECTED # if the origin url is in the white list, it can be immediately saved for url_prefix in get_origin_save_authorized_urls(): if origin_url.startswith(url_prefix): return SAVE_REQUEST_ACCEPTED # otherwise, the origin url needs to be manually verified if the user # that submitted it does not have special permission if bypass_pending_review: # mark the origin URL as trusted in that case SaveAuthorizedOrigin.objects.get_or_create(url=origin_url) return SAVE_REQUEST_ACCEPTED else: return SAVE_REQUEST_PENDING # map visit type to scheduler task # TODO: do not hardcode the task name here (T1157) _visit_type_task = { "git": "load-git", "hg": "load-hg", "svn": "load-svn", "cvs": "load-cvs", "bzr": "load-bzr", } _visit_type_task_privileged = { "archives": "load-archive-files", } # map scheduler task status to origin save status _save_task_status = { "next_run_not_scheduled": SAVE_TASK_NOT_YET_SCHEDULED, "next_run_scheduled": SAVE_TASK_SCHEDULED, "completed": SAVE_TASK_SUCCEEDED, "disabled": SAVE_TASK_FAILED, } # map scheduler task_run status to origin save status _save_task_run_status = { "scheduled": SAVE_TASK_SCHEDULED, "started": SAVE_TASK_RUNNING, "eventful": SAVE_TASK_SUCCEEDED, "uneventful": SAVE_TASK_SUCCEEDED, "failed": SAVE_TASK_FAILED, "permfailed": SAVE_TASK_FAILED, "lost": SAVE_TASK_FAILED, } @lru_cache() def get_scheduler_load_task_types() -> List[str]: task_types = scheduler().get_task_types() return [t["type"] for t in task_types if t["type"].startswith("load")] def get_savable_visit_types_dict(privileged_user: bool = False) -> Dict: """Return the supported task types the user has access to. Args: privileged_user: Flag to determine if all visit types should be returned or not. Defaults to False to only list unprivileged visit types. Returns: the dict of supported visit types for the user """ if privileged_user: task_types = {**_visit_type_task, **_visit_type_task_privileged} else: task_types = _visit_type_task # filter visit types according to scheduler load task types if available try: load_task_types = get_scheduler_load_task_types() return {k: v for k, v in task_types.items() if v in load_task_types} except Exception: return task_types def get_savable_visit_types(privileged_user: bool = False) -> List[str]: """Return the list of visit types the user can perform save requests on. Args: privileged_user: Flag to determine if all visit types should be returned or not. Defaults to False to only list unprivileged visit types. Returns: the list of savable visit types """ return sorted(list(get_savable_visit_types_dict(privileged_user).keys())) def _check_visit_type_savable(visit_type: str, privileged_user: bool = False) -> None: visit_type_tasks = get_savable_visit_types(privileged_user) if visit_type not in visit_type_tasks: allowed_visit_types = ", ".join(visit_type_tasks) raise BadInputExc( f"Visit of type {visit_type} cannot be saved! 
" f"Allowed types are the following: {allowed_visit_types}" ) _validate_url = URLValidator( schemes=["http", "https", "svn", "git", "rsync", "pserver", "ssh", "bzr"] ) def _check_origin_url_valid(origin_url: str) -> None: try: _validate_url(origin_url) except ValidationError: raise BadInputExc( "The provided origin url (%s) is not valid!" % escape(origin_url) ) def origin_exists(origin_url: str) -> OriginExistenceCheckInfo: """Check the origin url for existence. If it exists, extract some more useful information on the origin. """ resp = requests.head(origin_url, allow_redirects=True) exists = resp.ok content_length: Optional[int] = None last_modified: Optional[str] = None if exists: # Also process X-Archive-Orig-* headers in case the URL targets the # Internet Archive. size_ = resp.headers.get( "Content-Length", resp.headers.get("X-Archive-Orig-Content-Length") ) content_length = int(size_) if size_ else None try: date_str = resp.headers.get( "Last-Modified", resp.headers.get("X-Archive-Orig-Last-Modified", "") ) date = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z") last_modified = date.isoformat() except ValueError: # if not provided or not parsable as per the expected format, keep it None pass return OriginExistenceCheckInfo( origin_url=origin_url, exists=exists, last_modified=last_modified, content_length=content_length, ) def _check_origin_exists(url: str) -> OriginExistenceCheckInfo: """Ensure an URL exists, if not raise an explicit message.""" metadata = origin_exists(url) if not metadata["exists"]: raise BadInputExc(f"The provided url ({escape(url)}) does not exist!") return metadata def _get_visit_info_for_save_request( save_request: SaveOriginRequest, ) -> Tuple[Optional[datetime], Optional[str]]: """Retrieve visit information out of a save request Args: save_request: Input save origin request to retrieve information for. Returns: Tuple of (visit date, optional visit status) for such save request origin """ visit_date = None visit_status = None time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # stop trying to find a visit date one month after save request submission # as those requests to storage are expensive and associated loading task # surely ended up with errors if time_delta.days <= MAX_THRESHOLD_DAYS: origin = save_request.origin_url ovs = archive.origin_visit_find_by_date(origin, save_request.request_date) if ovs: visit_date = parse_iso8601_date_to_utc(ovs["date"]) visit_status = ovs["status"] return visit_date, visit_status def _check_visit_update_status( save_request: SaveOriginRequest, ) -> Tuple[Optional[datetime], Optional[str], Optional[str]]: """Given a save request, determine whether a save request was successful or failed. Args: save_request: Input save origin request to retrieve information for. 
Returns: Tuple of (optional visit date, optional visit status, optional save task status) for such save request origin """ visit_date, visit_status = _get_visit_info_for_save_request(save_request) loading_task_status = None if visit_date and visit_status in ("full", "partial"): # visit has been performed, mark the saving task as succeeded loading_task_status = SAVE_TASK_SUCCEEDED elif visit_status in ("created", "ongoing"): # visit is currently running loading_task_status = SAVE_TASK_RUNNING elif visit_status in ("not_found", "failed"): loading_task_status = SAVE_TASK_FAILED else: time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # consider the task as failed if it is still in scheduled state # 30 days after its submission if time_delta.days > MAX_THRESHOLD_DAYS: loading_task_status = SAVE_TASK_FAILED return visit_date, visit_status, loading_task_status def _compute_task_loading_status( - task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, + task: Optional[Dict[str, Any]] = None, + task_run: Optional[Dict[str, Any]] = None, ) -> Optional[str]: loading_task_status: Optional[str] = None # First determine the loading task status out of task information if task: loading_task_status = _save_task_status[task["status"]] if task_run: loading_task_status = _save_task_run_status[task_run["status"]] return loading_task_status def _update_save_request_info( save_request: SaveOriginRequest, task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, ) -> SaveOriginRequestInfo: """Update save request information out of the visit status and fall back to the task and task_run information if the visit status is missing. Args: save_request: Save request task: Associated scheduler task information about the save request task_run: Most recent run occurrence of the associated task Returns: Summary of the save request information updated. """ must_save = False # To determine the save code now request's final status, the visit date must be set # and the visit status must be a final one. Once they are, the save code now request # is definitely done. if ( not save_request.visit_date or not save_request.visit_status or save_request.visit_status in NON_TERMINAL_STATUSES ): visit_date, visit_status, loading_task_status = _check_visit_update_status( save_request ) if not loading_task_status: # fallback when not provided loading_task_status = _compute_task_loading_status(task, task_run) if visit_date != save_request.visit_date: must_save = True save_request.visit_date = visit_date if visit_status != save_request.visit_status: must_save = True save_request.visit_status = visit_status if ( loading_task_status is not None and loading_task_status != save_request.loading_task_status ): must_save = True save_request.loading_task_status = loading_task_status if must_save: save_request.save() return save_request.to_dict() def create_save_origin_request( visit_type: str, origin_url: str, privileged_user: bool = False, user_id: Optional[int] = None, **kwargs, ) -> SaveOriginRequestInfo: """Create a loading task to save a software origin into the archive. This function aims to create a software origin loading task through the use of the swh-scheduler component. First, some checks are performed to see if the visit type and origin url are valid, and whether the save request can be accepted. For the 'archives' visit type, this also ensures the artifacts actually exist. If those checks passed, the loading task is then created. 
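# For reference, the visit-status branching in _check_visit_update_status
# above collapses to this lookup table (sketch; the real code uses the
# SAVE_TASK_* constants, and statuses not listed stay undetermined until the
# MAX_THRESHOLD_DAYS timeout marks them failed):
VISIT_TO_TASK_STATUS = {
    "full": "succeeded",
    "partial": "succeeded",
    "created": "running",
    "ongoing": "running",
    "not_found": "failed",
    "failed": "failed",
}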
Otherwise, the save request is put in pending or rejected state. All the submitted save requests are logged into the swh-web database to keep track of them. Args: visit_type: the type of visit to perform (e.g. git, hg, svn, archives, ...) origin_url: the url of the origin to save privileged_user: Whether the user has more privileges than others (bypass review, access to privileged visit types) user_id: User identifier (provided when authenticated) kwargs: Optional parameters (e.g. artifact_url, artifact_filename, artifact_version) Raises: BadInputExc: the visit type or origin url is invalid or nonexistent ForbiddenExc: the provided origin url is blacklisted Returns: dict: A dict describing the save request with the following keys: * **visit_type**: the type of visit to perform * **origin_url**: the url of the origin * **save_request_date**: the date the request was submitted * **save_request_status**: the request status, either **accepted**, **rejected** or **pending** * **save_task_status**: the origin loading task status, either **not created**, **not yet scheduled**, **scheduled**, **succeeded** or **failed** """ visit_type_tasks = get_savable_visit_types_dict(privileged_user) _check_visit_type_savable(visit_type, privileged_user) _check_origin_url_valid(origin_url) # if all checks passed so far, we can try and save the origin save_request_status = can_save_origin(origin_url, privileged_user) task = None # if the origin save request is accepted, create a scheduler # task to load it into the archive if save_request_status == SAVE_REQUEST_ACCEPTED: # create a task with high priority task_kwargs: Dict[str, Any] = { "priority": "high", "url": origin_url, } if visit_type == "archives": # extra arguments for that type are required archives_data = kwargs.get("archives_data", []) if not archives_data: raise BadInputExc( "Artifacts data are missing for the archives visit type." 
) artifacts = [] for artifact in archives_data: artifact_url = artifact.get("artifact_url") artifact_version = artifact.get("artifact_version") if not artifact_url or not artifact_version: raise BadInputExc("Missing url or version for an artifact to load.") metadata = _check_origin_exists(artifact_url) artifacts.append( { "url": artifact_url, "version": artifact_version, "time": metadata["last_modified"], "length": metadata["content_length"], } ) task_kwargs = dict(**task_kwargs, artifacts=artifacts, snapshot_append=True) sor = None # get list of previously submitted save requests (most recent first) current_sors = list( SaveOriginRequest.objects.filter( visit_type=visit_type, origin_url=origin_url ).order_by("-request_date") ) can_create_task = False # if no save requests previously submitted, create the scheduler task if not current_sors: can_create_task = True else: # get the latest submitted save request sor = current_sors[0] # if it was in pending state, we need to create the scheduler task # and update the save request info in the database if sor.status == SAVE_REQUEST_PENDING: can_create_task = True # a task has already been created to load the origin elif sor.loading_task_id != -1: # get the scheduler task and its status tasks = scheduler().get_tasks([sor.loading_task_id]) task = tasks[0] if tasks else None task_runs = scheduler().get_task_runs([sor.loading_task_id]) task_run = task_runs[0] if task_runs else None save_request_info = _update_save_request_info(sor, task, task_run) task_status = save_request_info["save_task_status"] # create a new scheduler task only if the previous one has # already been executed if ( task_status == SAVE_TASK_FAILED or task_status == SAVE_TASK_SUCCEEDED ): can_create_task = True sor = None else: can_create_task = False if can_create_task: # effectively create the scheduler task task_dict = create_oneshot_task_dict( visit_type_tasks[visit_type], **task_kwargs ) task = scheduler().create_tasks([task_dict])[0] # pending save request has been accepted if sor: sor.status = SAVE_REQUEST_ACCEPTED sor.loading_task_id = task["id"] sor.save() else: sor = SaveOriginRequest.objects.create( visit_type=visit_type, origin_url=origin_url, status=save_request_status, loading_task_id=task["id"], user_ids=f'"{user_id}"' if user_id else None, ) # save request must be manually reviewed for acceptance elif save_request_status == SAVE_REQUEST_PENDING: # check if such a save request has already been submitted; # no need to add it to the database in that case try: sor = SaveOriginRequest.objects.get( visit_type=visit_type, origin_url=origin_url, status=save_request_status ) user_ids = sor.user_ids if sor.user_ids is not None else "" if user_id is not None and f'"{user_id}"' not in user_ids: # update user ids list sor.user_ids = f'{sor.user_ids},"{user_id}"' sor.save() # if not, add it to the database except ObjectDoesNotExist: sor = SaveOriginRequest.objects.create( visit_type=visit_type, origin_url=origin_url, status=save_request_status, user_ids=f'"{user_id}"' if user_id else None, ) # origin cannot be saved as its url is blacklisted, # log the request to the database anyway else: sor = SaveOriginRequest.objects.create( visit_type=visit_type, origin_url=origin_url, status=save_request_status, user_ids=f'"{user_id}"' if user_id else None, ) if save_request_status == SAVE_REQUEST_REJECTED: raise ForbiddenExc( ( 'The "save code now" request has been rejected ' "because the provided origin url is blacklisted." 
) ) assert sor is not None return _update_save_request_info(sor, task) def update_save_origin_requests_from_queryset( requests_queryset: QuerySet, ) -> List[SaveOriginRequestInfo]: """Update all save requests from a SaveOriginRequest queryset, update their status in the database and return the list of impacted save requests. Args: requests_queryset: input SaveOriginRequest queryset Returns: list: A list of save origin request info dicts as described in :func:`swh.web.common.origin_save.create_save_origin_request` """ task_ids = [] for sor in requests_queryset: task_ids.append(sor.loading_task_id) save_requests = [] if task_ids: try: tasks = scheduler().get_tasks(task_ids) tasks = {task["id"]: task for task in tasks} task_runs = scheduler().get_task_runs(tasks) task_runs = {task_run["task"]: task_run for task_run in task_runs} except Exception: # avoids having to mock api GET responses for the /origin/save endpoint # when running cypress tests, as the scheduler is not available tasks = {} task_runs = {} for sor in requests_queryset: sr_dict = _update_save_request_info( - sor, tasks.get(sor.loading_task_id), task_runs.get(sor.loading_task_id), + sor, + tasks.get(sor.loading_task_id), + task_runs.get(sor.loading_task_id), ) save_requests.append(sr_dict) return save_requests def refresh_save_origin_request_statuses() -> List[SaveOriginRequestInfo]: """Refresh non-terminal save origin requests (SOR) in the backend. Non-terminal SOR are requests whose status is **accepted** and whose task status is either **created**, **not yet scheduled**, **scheduled** or **running**. This computes the list of such SOR, checks their current status in the scheduler (and optionally in elasticsearch), updates them in the database, and finally returns the refreshed information on those SOR. """ pivot_date = datetime.now(tz=timezone.utc) - timedelta(days=MAX_THRESHOLD_DAYS) save_requests = SaveOriginRequest.objects.filter( # Retrieve accepted request statuses (all statuses) Q(status=SAVE_REQUEST_ACCEPTED), # those without the required information we need to update Q(visit_date__isnull=True) | Q(visit_status__isnull=True) | Q(visit_status__in=NON_TERMINAL_STATUSES), # limit results to recent ones (that is roughly 30 days old at best) Q(request_date__gte=pivot_date), ) return ( update_save_origin_requests_from_queryset(save_requests) if save_requests.count() > 0 else [] ) def get_save_origin_requests( visit_type: str, origin_url: str ) -> List[SaveOriginRequestInfo]: """ Get all save requests for a given software origin. Args: visit_type: the type of visit origin_url: the url of the origin Raises: BadInputExc: the visit type or origin url is invalid swh.web.common.exc.NotFoundExc: no save requests can be found for the given origin Returns: list: A list of save origin requests dict as described in :func:`swh.web.common.origin_save.create_save_origin_request` """ _check_visit_type_savable(visit_type) _check_origin_url_valid(origin_url) sors = SaveOriginRequest.objects.filter( visit_type=visit_type, origin_url=origin_url ) if sors.count() == 0: raise NotFoundExc( f"No save requests found for visit of type {visit_type} " f"on origin with url {origin_url}." ) return update_save_origin_requests_from_queryset(sors) def get_save_origin_task_info( save_request_id: int, full_info: bool = True ) -> Dict[str, Any]: """ Get detailed information about an accepted save origin request and its associated loading task. If the associated loading task info is archived and removed from the scheduler database, returns an empty dictionary. 
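# The queryset filter in refresh_save_origin_request_statuses above combines
# three conditions with AND; a standalone Django sketch of the same shape
# (model fields and status strings assumed to match swh-web's
# SaveOriginRequest and its constants):
from datetime import datetime, timedelta, timezone
from django.db.models import Q

def non_terminal_filter(pivot_days: int = 30) -> Q:
    pivot_date = datetime.now(tz=timezone.utc) - timedelta(days=pivot_days)
    return (
        Q(status="accepted")
        & (
            Q(visit_date__isnull=True)
            | Q(visit_status__isnull=True)
            | Q(visit_status__in=["created", "ongoing"])
        )
        & Q(request_date__gte=pivot_date)
    )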
Args: save_request_id: identifier of a save origin request full_info: whether to return detailed info for staff users Returns: A dictionary with the following keys: - **type**: loading task type - **arguments**: loading task arguments - **id**: loading task database identifier - **backend_id**: loading task celery identifier - **scheduled**: loading task scheduling date - **ended**: loading task termination date - **status**: loading task execution status - **visit_status**: Actual visit status Depending on the availability of the task logs in the elasticsearch cluster of Software Heritage, the returned dictionary may also contain the following keys: - **name**: associated celery task name - **message**: relevant log message from task execution - **duration**: task execution time (only if it succeeded) - **worker**: name of the worker that executed the task """ try: save_request = SaveOriginRequest.objects.get(id=save_request_id) except ObjectDoesNotExist: return {} task_info: Dict[str, Any] = {} if save_request.note is not None: task_info["note"] = save_request.note try: task = scheduler().get_tasks([save_request.loading_task_id]) except Exception: # to avoid mocking GET responses of /save/task/info/ endpoint when running # cypress tests as scheduler is not available in that case task = None task = task[0] if task else None if task is None: return task_info task_run = scheduler().get_task_runs([task["id"]]) task_run = task_run[0] if task_run else None if task_run is None: return task_info task_info.update(task_run) task_info["type"] = task["type"] task_info["arguments"] = task["arguments"] task_info["id"] = task_run["task"] del task_info["task"] del task_info["metadata"] # Enrich the task info with the loading visit status task_info["visit_status"] = save_request.visit_status es_workers_index_url = get_config()["es_workers_index_url"] if not es_workers_index_url: return task_info es_workers_index_url += "/_search" if save_request.visit_date: min_ts = save_request.visit_date max_ts = min_ts + timedelta(days=7) else: min_ts = save_request.request_date max_ts = min_ts + timedelta(days=MAX_THRESHOLD_DAYS) min_ts_unix = int(min_ts.timestamp()) * 1000 max_ts_unix = int(max_ts.timestamp()) * 1000 save_task_status = _save_task_status[task["status"]] priority = "3" if save_task_status == SAVE_TASK_FAILED else "6" query = { "bool": { "must": [ {"match_phrase": {"syslog.priority": {"query": priority}}}, { "match_phrase": { "journald.custom.swh_task_id": {"query": task_run["backend_id"]} } }, { "range": { "@timestamp": { "gte": min_ts_unix, "lte": max_ts_unix, "format": "epoch_millis", } } }, ] } } try: response = requests.post( es_workers_index_url, json={"query": query, "sort": ["@timestamp"]}, timeout=30, ) results = json.loads(response.text) if results["hits"]["total"]["value"] >= 1: task_run_info = results["hits"]["hits"][-1]["_source"] journald_custom = task_run_info.get("journald", {}).get("custom", {}) task_info["duration"] = journald_custom.get( "swh_logging_args_runtime", "not available" ) task_info["message"] = task_run_info.get("message", "not available") task_info["name"] = journald_custom.get("swh_task_name", "not available") task_info["worker"] = task_run_info.get("host", {}).get("hostname") except Exception as exc: logger.warning("Request to Elasticsearch failed\n%s", exc) sentry_sdk.capture_exception(exc) if not full_info: for field in ("id", "backend_id", "worker"): # remove some staff only fields task_info.pop(field, None) if "message" in task_run and "Loading failure" in 
task_run["message"]: # hide traceback for non staff users, only display exception message_lines = task_info["message"].split("\n") message = "" for line in message_lines: if line.startswith("Traceback"): break message += f"{line}\n" message += message_lines[-1] task_info["message"] = message return task_info SUBMITTED_SAVE_REQUESTS_METRIC = "swh_web_submitted_save_requests" _submitted_save_requests_gauge = Gauge( name=SUBMITTED_SAVE_REQUESTS_METRIC, documentation="Number of submitted origin save requests", labelnames=["status", "visit_type"], registry=SWH_WEB_METRICS_REGISTRY, ) ACCEPTED_SAVE_REQUESTS_METRIC = "swh_web_accepted_save_requests" _accepted_save_requests_gauge = Gauge( name=ACCEPTED_SAVE_REQUESTS_METRIC, documentation="Number of accepted origin save requests", labelnames=["load_task_status", "visit_type"], registry=SWH_WEB_METRICS_REGISTRY, ) # Metric on the delay of save code now request per status and visit_type. This is the # time difference between the save code now is requested and the time it got ingested. ACCEPTED_SAVE_REQUESTS_DELAY_METRIC = "swh_web_save_requests_delay_seconds" _accepted_save_requests_delay_gauge = Gauge( name=ACCEPTED_SAVE_REQUESTS_DELAY_METRIC, documentation="Save Requests Duration", labelnames=["load_task_status", "visit_type"], registry=SWH_WEB_METRICS_REGISTRY, ) def compute_save_requests_metrics() -> None: """Compute Prometheus metrics related to origin save requests: - Number of submitted origin save requests - Number of accepted origin save requests - Save Code Now requests delay between request time and actual time of ingestion """ request_statuses = ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING, ) load_task_statuses = ( SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING, ) # for metrics, we want access to all visit types visit_types = get_savable_visit_types(privileged_user=True) labels_set = product(request_statuses, visit_types) for labels in labels_set: _submitted_save_requests_gauge.labels(*labels).set(0) labels_set = product(load_task_statuses, visit_types) for labels in labels_set: _accepted_save_requests_gauge.labels(*labels).set(0) duration_load_task_statuses = ( SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED, ) for labels in product(duration_load_task_statuses, visit_types): _accepted_save_requests_delay_gauge.labels(*labels).set(0) for sor in SaveOriginRequest.objects.all(): if sor.status == SAVE_REQUEST_ACCEPTED: _accepted_save_requests_gauge.labels( - load_task_status=sor.loading_task_status, visit_type=sor.visit_type, + load_task_status=sor.loading_task_status, + visit_type=sor.visit_type, ).inc() _submitted_save_requests_gauge.labels( status=sor.status, visit_type=sor.visit_type ).inc() if ( sor.loading_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED) and sor.visit_date is not None and sor.request_date is not None ): delay = sor.visit_date.timestamp() - sor.request_date.timestamp() _accepted_save_requests_delay_gauge.labels( - load_task_status=sor.loading_task_status, visit_type=sor.visit_type, + load_task_status=sor.loading_task_status, + visit_type=sor.visit_type, ).inc(delay) diff --git a/swh/web/common/swh_templatetags.py b/swh/web/common/swh_templatetags.py index 627b2050..66deffae 100644 --- a/swh/web/common/swh_templatetags.py +++ b/swh/web/common/swh_templatetags.py @@ -1,149 +1,149 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: 
GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json import re from django import template from django.utils.safestring import mark_safe from swh.web.common.converters import SWHDjangoJSONEncoder from swh.web.common.origin_save import get_savable_visit_types from swh.web.common.utils import rst_to_html register = template.Library() @register.filter def docstring_display(docstring): """ Utility function to htmlize reST-formatted documentation in browsable api. """ return rst_to_html(docstring) @register.filter def urlize_links_and_mails(text): """Utility function for decorating api links in browsable api. Args: text: whose content matching links should be transformed into contextual API or Browse html links. Returns The text transformed if any link is found. The text as is otherwise. """ if 'href="' not in text: text = re.sub(r"(http.*)", r'<a href="\1">\1</a>', text) return re.sub(r'([^ <>"]+@[^ <>"]+)', r'<a href="mailto:\1">\1</a>', text) return text @register.filter def urlize_header_links(text): """Utility function for decorating headers links in browsable api. Args text: Text whose content contains Link header value Returns: The text transformed with html link if any link is found. The text as is otherwise. """ ret = re.sub( r'<(http[^<>]+)>; rel="([^,]+)"', r'<<a href="\1">\1</a>>; rel="\2"\n', text ).replace("\n,", "\n") return ret[:-1] @register.filter def jsonify(obj): """Utility function for converting a django template variable to JSON in order to use it in script tags. Args obj: Any django template context variable Returns: JSON representation of the variable. """ return mark_safe(json.dumps(obj, cls=SWHDjangoJSONEncoder)) @register.filter def sub(value, arg): """Django template filter for subtracting two numbers Args: value (int/float): the value to subtract from arg (int/float): the value to subtract to Returns: int/float: The subtraction result """ return value - arg @register.filter def mul(value, arg): """Django template filter for multiplying two numbers Args: value (int/float): the value to multiply from arg (int/float): the value to multiply with Returns: int/float: The multiplication result """ return value * arg @register.filter def key_value(dict, key): """Django template filter to get a value in a dictionary. - Args: - dict (dict): a dictionary - key (str): the key to lookup value + Args: + dict (dict): a dictionary + key (str): the key to lookup value - Returns: - The requested value in the dictionary + Returns: + The requested value in the dictionary """ return dict[key] @register.filter def visit_type_savable(visit_type: str) -> bool: """Django template filter to check if a save request can be created for a given visit type. Args: visit_type: the type of visit Returns: If the visit type is saveable or not """ return visit_type in get_savable_visit_types() @register.filter def split(value, arg): """Django template filter to split a string. 
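    For instance (values below are illustrative), using ``{{ path|split:"/" }}``
    in a template is equivalent to the following call:

    .. code-block:: python

        split("a/b/c", "/")  # -> ["a", "b", "c"]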
- Args: - value (str): the string to split - arg (str): the split separator + Args: + value (str): the string to split + arg (str): the split separator - Returns: - list: the split string parts + Returns: + list: the split string parts """ return value.split(arg) diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py index 6b8b1ae8..1853f6a4 100644 --- a/swh/web/common/utils.py +++ b/swh/web/common/utils.py @@ -1,600 +1,602 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import functools import os import re from typing import Any, Callable, Dict, List, Optional import urllib.parse from xml.etree import ElementTree from bs4 import BeautifulSoup from docutils.core import publish_parts import docutils.parsers.rst import docutils.utils from docutils.writers.html5_polyglot import HTMLTranslator, Writer from iso8601 import ParseError, parse_date from pkg_resources import get_distribution from prometheus_client.registry import CollectorRegistry import requests from requests.auth import HTTPBasicAuth import sentry_sdk from django.core.cache import cache from django.core.cache.backends.base import DEFAULT_TIMEOUT from django.http import HttpRequest, QueryDict from django.shortcuts import redirect from django.urls import resolve from django.urls import reverse as django_reverse from swh.web.auth.utils import ( ADD_FORGE_MODERATOR_PERMISSION, ADMIN_LIST_DEPOSIT_PERMISSION, MAILMAP_ADMIN_PERMISSION, ) from swh.web.common.exc import BadInputExc from swh.web.common.typing import QueryParameters from swh.web.config import SWH_WEB_SERVER_NAME, get_config, search SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True) swh_object_icons = { "alias": "mdi mdi-star", "branch": "mdi mdi-source-branch", "branches": "mdi mdi-source-branch", "content": "mdi mdi-file-document", "cnt": "mdi mdi-file-document", "directory": "mdi mdi-folder", "dir": "mdi mdi-folder", "origin": "mdi mdi-source-repository", "ori": "mdi mdi-source-repository", "person": "mdi mdi-account", "revisions history": "mdi mdi-history", "release": "mdi mdi-tag", "rel": "mdi mdi-tag", "releases": "mdi mdi-tag", "revision": "mdi mdi-rotate-90 mdi-source-commit", "rev": "mdi mdi-rotate-90 mdi-source-commit", "snapshot": "mdi mdi-camera", "snp": "mdi mdi-camera", "visits": "mdi mdi-calendar-month", } def reverse( viewname: str, url_args: Optional[Dict[str, Any]] = None, query_params: Optional[QueryParameters] = None, current_app: Optional[str] = None, urlconf: Optional[str] = None, request: Optional[HttpRequest] = None, ) -> str: """An override of django reverse function supporting query parameters. 
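    For instance, a minimal sketch (the view name and parameters below are
    illustrative, borrowed from elsewhere in this code base):

    .. code-block:: python

        url = reverse(
            "browse-origin",
            query_params={"origin_url": "https://github.com/user/repo"},
        )
        # url now ends with "?origin_url=https://github.com/user/repo"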
Args: viewname: the name of the django view from which to compute a url url_args: dictionary of url arguments indexed by their names query_params: dictionary of query parameters to append to the reversed url current_app: the name of the django app tighten to the view urlconf: url configuration module request: build an absolute URI if provided Returns: str: the url of the requested view with processed arguments and query parameters """ if url_args: url_args = {k: v for k, v in url_args.items() if v is not None} url = django_reverse( viewname, urlconf=urlconf, kwargs=url_args, current_app=current_app ) if query_params: query_params = {k: v for k, v in query_params.items() if v is not None} if query_params and len(query_params) > 0: query_dict = QueryDict("", mutable=True) for k in sorted(query_params.keys()): query_dict[k] = query_params[k] url += "?" + query_dict.urlencode(safe="/;:") if request is not None: url = request.build_absolute_uri(url) return url def datetime_to_utc(date): """Returns datetime in UTC without timezone info Args: date (datetime.datetime): input datetime with timezone info Returns: datetime.datetime: datetime in UTC without timezone info """ if date.tzinfo and date.tzinfo != timezone.utc: return date.astimezone(tz=timezone.utc) else: return date def parse_iso8601_date_to_utc(iso_date: str) -> datetime: """Given an ISO 8601 datetime string, parse the result as UTC datetime. Returns: a timezone-aware datetime representing the parsed date Raises: swh.web.common.exc.BadInputExc: provided date does not respect ISO 8601 format Samples: - 2016-01-12 - 2016-01-12T09:19:12+0100 - 2007-01-14T20:34:22Z """ try: date = parse_date(iso_date) return datetime_to_utc(date) except ParseError as e: raise BadInputExc(e) def shorten_path(path): """Shorten the given path: for each hash present, only return the first 8 characters followed by an ellipsis""" sha256_re = r"([0-9a-f]{8})[0-9a-z]{56}" sha1_re = r"([0-9a-f]{8})[0-9a-f]{32}" ret = re.sub(sha256_re, r"\1...", path) return re.sub(sha1_re, r"\1...", ret) def format_utc_iso_date(iso_date, fmt="%d %B %Y, %H:%M UTC"): """Turns a string representation of an ISO 8601 datetime string to UTC and format it into a more human readable one. For instance, from the following input string: '2017-05-04T13:27:13+02:00' the following one is returned: '04 May 2017, 11:27 UTC'. Custom format string may also be provided as parameter Args: iso_date (str): a string representation of an ISO 8601 date fmt (str): optional date formatting string Returns: str: a formatted string representation of the input iso date """ if not iso_date: return iso_date date = parse_iso8601_date_to_utc(iso_date) return date.strftime(fmt) def gen_path_info(path): """Function to generate path data navigation for use with a breadcrumb in the swh web ui. For instance, from a path /folder1/folder2/folder3, it returns the following list:: [{'name': 'folder1', 'path': 'folder1'}, {'name': 'folder2', 'path': 'folder1/folder2'}, {'name': 'folder3', 'path': 'folder1/folder2/folder3'}] Args: path: a filesystem path Returns: list: a list of path data for navigation as illustrated above. """ path_info = [] if path: sub_paths = path.strip("/").split("/") path_from_root = "" for p in sub_paths: path_from_root += "/" + p path_info.append({"name": p, "path": path_from_root.strip("/")}) return path_info def parse_rst(text, report_level=2): """ Parse a reStructuredText string with docutils. 
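    A minimal sketch (the input string below is illustrative):

    .. code-block:: python

        # report_level=5 silences all docutils messages
        document = parse_rst("Some *emphasized* text", report_level=5)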
Args: text (str): string with reStructuredText markups in it report_level (int): level of docutils report messages to print (1 info 2 warning 3 error 4 severe 5 none) Returns: docutils.nodes.document: a parsed docutils document """ parser = docutils.parsers.rst.Parser() components = (docutils.parsers.rst.Parser,) settings = docutils.frontend.OptionParser( components=components ).get_default_values() settings.report_level = report_level document = docutils.utils.new_document("rst-doc", settings=settings) parser.parse(text, document) return document def get_client_ip(request): """ Return the client IP address from an incoming HTTP request. Args: request (django.http.HttpRequest): the incoming HTTP request Returns: str: The client IP address """ x_forwarded_for = request.META.get("HTTP_X_FORWARDED_FOR") if x_forwarded_for: ip = x_forwarded_for.split(",")[0] else: ip = request.META.get("REMOTE_ADDR") return ip def is_swh_web_development(request: HttpRequest) -> bool: - """Indicate if we are running a development version of swh-web. - """ + """Indicate if we are running a development version of swh-web.""" site_base_url = request.build_absolute_uri("/") return any( host in site_base_url for host in ("localhost", "127.0.0.1", "testserver") ) def is_swh_web_staging(request: HttpRequest) -> bool: - """Indicate if we are running a staging version of swh-web. - """ + """Indicate if we are running a staging version of swh-web.""" config = get_config() site_base_url = request.build_absolute_uri("/") return any( server_name in site_base_url for server_name in config["staging_server_names"] ) def is_swh_web_production(request: HttpRequest) -> bool: - """Indicate if we are running the public production version of swh-web. - """ + """Indicate if we are running the public production version of swh-web.""" return SWH_WEB_SERVER_NAME in request.build_absolute_uri("/") browsers_supported_image_mimes = set( [ "image/gif", "image/png", "image/jpeg", "image/bmp", "image/webp", "image/svg", "image/svg+xml", ] ) def context_processor(request): """ Django context processor used to inject variables in all swh-web templates. """ config = get_config() if ( hasattr(request, "user") and request.user.is_authenticated and not hasattr(request.user, "backend") ): # To avoid django.template.base.VariableDoesNotExist errors # when rendering templates when standard Django user is logged in. request.user.backend = "django.contrib.auth.backends.ModelBackend" return { "swh_object_icons": swh_object_icons, "available_languages": None, "swh_client_config": config["client_config"], "oidc_enabled": bool(config["keycloak"]["server_url"]), "browsers_supported_image_mimes": browsers_supported_image_mimes, "keycloak": config["keycloak"], "site_base_url": request.build_absolute_uri("/"), "DJANGO_SETTINGS_MODULE": os.environ["DJANGO_SETTINGS_MODULE"], "status": config["status"], "swh_web_dev": is_swh_web_development(request), "swh_web_staging": is_swh_web_staging(request), "swh_web_version": get_distribution("swh.web").version, "iframe_mode": False, "ADMIN_LIST_DEPOSIT_PERMISSION": ADMIN_LIST_DEPOSIT_PERMISSION, "ADD_FORGE_MODERATOR_PERMISSION": ADD_FORGE_MODERATOR_PERMISSION, "FEATURES": get_config()["features"], "MAILMAP_ADMIN_PERMISSION": MAILMAP_ADMIN_PERMISSION, } def resolve_branch_alias( snapshot: Dict[str, Any], branch: Optional[Dict[str, Any]] ) -> Optional[Dict[str, Any]]: """ Resolve branch alias in snapshot content. 
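    For instance, with a ``HEAD`` branch aliasing ``refs/heads/master``
    (the data below is illustrative):

    .. code-block:: python

        snapshot = {
            "id": "some-snapshot-id",
            "branches": {
                "HEAD": {"target_type": "alias", "target": "refs/heads/master"},
                "refs/heads/master": {
                    "target_type": "revision",
                    "target": "some-revision-id",
                },
            },
        }
        branch = resolve_branch_alias(snapshot, snapshot["branches"]["HEAD"])
        # branch is now the "refs/heads/master" revision branch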
Args: snapshot: a full snapshot content branch: a branch alias contained in the snapshot Returns: The real snapshot branch that got aliased. """ while branch and branch["target_type"] == "alias": if branch["target"] in snapshot["branches"]: branch = snapshot["branches"][branch["target"]] else: from swh.web.common import archive snp = archive.lookup_snapshot( snapshot["id"], branches_from=branch["target"], branches_count=1 ) if snp and branch["target"] in snp["branches"]: branch = snp["branches"][branch["target"]] else: branch = None return branch class _NoHeaderHTMLTranslator(HTMLTranslator): """ Docutils translator subclass to customize the generation of HTML from reST-formatted docstrings """ def __init__(self, document): super().__init__(document) self.body_prefix = [] self.body_suffix = [] _HTML_WRITER = Writer() _HTML_WRITER.translator_class = _NoHeaderHTMLTranslator def rst_to_html(rst: str) -> str: """ Convert reStructuredText document into HTML. Args: rst: A string containing a reStructuredText document Returns: Body content of the produced HTML conversion. """ settings = { "initial_header_level": 2, "halt_level": 4, "traceback": True, "file_insertion_enabled": False, "raw_enabled": False, } pp = publish_parts(rst, writer=_HTML_WRITER, settings_overrides=settings) return f'<div class="swh-rst">{pp["html_body"]}</div>' def prettify_html(html: str) -> str: """ Prettify an HTML document. Args: html: Input HTML document Returns: The prettified HTML document """ return BeautifulSoup(html, "lxml").prettify() def django_cache( timeout: int = DEFAULT_TIMEOUT, catch_exception: bool = False, exception_return_value: Any = None, invalidate_cache_pred: Callable[[Any], bool] = lambda val: False, ): """Decorator to put the result of a function call in Django cache, subsequent calls will directly return the cached value. 
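    A minimal usage sketch (the timeout value and decorated function below are
    illustrative):

    .. code-block:: python

        @django_cache(timeout=3600, catch_exception=True, exception_return_value=[])
        def get_expensive_data():
            # computed at most once per hour; an empty list is returned
            # if the call raises instead of propagating the exception
            ...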
    Args:
        timeout: The number of seconds the value will be held in the cache
        catch_exception: If :const:`True`, any exception thrown by the decorated
            function will be caught and not reraised
        exception_return_value: The value to return if the previous parameter is
            set to :const:`True`
        invalidate_cache_pred: A predicate function used to invalidate the cache
            under certain conditions; the decorated function will then be called
            again

    Returns:
        The returned value of the decorated function for the specified parameters
    """

    def inner(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            func_args = args + (0,) + tuple(sorted(kwargs.items()))
            cache_key = str(hash((func.__module__, func.__name__) + func_args))
            ret = cache.get(cache_key)
            if ret is None or invalidate_cache_pred(ret):
                try:
                    ret = func(*args, **kwargs)
                except Exception as exc:
                    sentry_sdk.capture_exception(exc)
                    if catch_exception:
                        return exception_return_value
                    else:
                        raise
                else:
                    cache.set(cache_key, ret, timeout=timeout)
            return ret

        return wrapper

    return inner


def _deposits_list_url(
    deposits_list_base_url: str, page_size: int, username: Optional[str]
) -> str:
    params = {"page_size": str(page_size)}
    if username is not None:
        params["username"] = username
    return f"{deposits_list_base_url}?{urllib.parse.urlencode(params)}"


def get_deposits_list(username: Optional[str] = None) -> List[Dict[str, Any]]:
-    """Return the list of software deposits using swh-deposit API
-    """
+    """Return the list of software deposits using swh-deposit API"""
    config = get_config()["deposit"]
    deposits_list_base_url = config["private_api_url"] + "deposits"
    deposits_list_auth = HTTPBasicAuth(
        config["private_api_user"], config["private_api_password"]
    )

    deposits_list_url = _deposits_list_url(
        deposits_list_base_url, page_size=1, username=username
    )

    nb_deposits = requests.get(
        deposits_list_url, auth=deposits_list_auth, timeout=30
    ).json()["count"]

    @django_cache(invalidate_cache_pred=lambda data: data["count"] != nb_deposits)
    def _get_deposits_data():
        deposits_list_url = _deposits_list_url(
            deposits_list_base_url, page_size=nb_deposits, username=username
        )
        return requests.get(
-            deposits_list_url, auth=deposits_list_auth, timeout=30,
+            deposits_list_url,
+            auth=deposits_list_auth,
+            timeout=30,
        ).json()

    deposits_data = _get_deposits_data()

    return deposits_data["results"]


@django_cache()
def get_deposit_raw_metadata(deposit_id: int) -> Optional[str]:
    config = get_config()["deposit"]
    url = f"{config['private_api_url']}/{deposit_id}/meta"
    return requests.get(url).json()["raw_metadata"]


_origin_visit_types_cache_timeout = 24 * 60 * 60  # 24 hours


@django_cache(
    timeout=_origin_visit_types_cache_timeout,
    catch_exception=True,
    exception_return_value=[],
)
def origin_visit_types() -> List[str]:
    """Return the exhaustive list of visit types for origins
    ingested into the archive.
    """
    return sorted(search().visit_types_count().keys())


def redirect_to_new_route(request, new_route, permanent=True):
    """Redirect a request to another route with url args and query parameters,
    e.g. /origin/<url:url-val>/log?path=test can be redirected as
    /log?url=<url-val>&path=test. This can be used to deprecate routes.
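    For instance, a deprecated URL pattern can be wired to its new route as
    follows (the route name and regexp below are illustrative):

    .. code-block:: python

        from functools import partial

        from django.conf.urls import url

        url(
            r"^origin/(?P<url>.+)/log/$",
            partial(redirect_to_new_route, new_route="origin-log"),
        )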
    """
    request_path = resolve(request.path_info)
    args = {**request_path.kwargs, **request.GET.dict()}
-    return redirect(reverse(new_route, query_params=args), permanent=permanent,)
+    return redirect(
+        reverse(new_route, query_params=args),
+        permanent=permanent,
+    )


NAMESPACES = {
    "swh": "https://www.softwareheritage.org/schema/2018/deposit",
    "schema": "http://schema.org/",
}


def parse_swh_metadata_provenance(raw_metadata: str) -> Optional[str]:
    """Parse the swh metadata-provenance out of the raw deposit metadata. If
    found, returns its value, None otherwise.

    .. code-block:: xml

        <swh:deposit>
          <swh:metadata-provenance>
            <schema:url>https://example.org/metadata/url</schema:url>
          </swh:metadata-provenance>
        </swh:deposit>

    Args:
        raw_metadata: raw metadata out of deposits received

    Returns:
        Either the metadata provenance url if any or None otherwise

    """
    metadata = ElementTree.fromstring(raw_metadata)
    url = metadata.findtext(
-        "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES,
+        "swh:deposit/swh:metadata-provenance/schema:url",
+        namespaces=NAMESPACES,
    )
    return url or None


def parse_swh_deposit_origin(raw_metadata: str) -> Optional[str]:
    """Parses <swh:add_to_origin> and <swh:create_origin> out of a metadata
    document, if any. They are mutually exclusive and tested as such in the
    deposit.

    .. code-block:: xml

        <swh:deposit>
          <swh:create_origin>
            <swh:origin url='https://example.org/repo/software123/'/>
          </swh:create_origin>
        </swh:deposit>

    .. code-block:: xml

        <swh:deposit>
          <swh:add_to_origin>
            <swh:origin url='https://example.org/repo/software123/'/>
          </swh:add_to_origin>
        </swh:deposit>

    Returns:
        The origin url of whichever element is present, None otherwise

    """
    metadata = ElementTree.fromstring(raw_metadata)
    for origin_tag in ["create_origin", "add_to_origin"]:
        elt = metadata.find(
            f"swh:deposit/swh:{origin_tag}/swh:origin[@url]", namespaces=NAMESPACES
        )
        if elt is not None:
            return elt.attrib["url"]
    return None


def has_add_forge_now_permission(user) -> bool:
    """Is a user considered an add-forge-now moderator?
Returns True if a user is staff or has add forge now moderator permission """ return user.is_staff or user.has_perm(ADD_FORGE_MODERATOR_PERMISSION) diff --git a/swh/web/config.py b/swh/web/config.py index 05ec6ee9..40b6c4aa 100644 --- a/swh/web/config.py +++ b/swh/web/config.py @@ -1,218 +1,232 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from swh.core import config from swh.counters import get_counters from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler from swh.search import get_search from swh.storage import get_storage from swh.vault import get_vault from swh.web import settings SWH_WEB_SERVER_NAME = "archive.softwareheritage.org" SWH_WEB_INTERNAL_SERVER_NAME = "archive.internal.softwareheritage.org" SWH_WEB_STAGING_SERVER_NAMES = [ "webapp.staging.swh.network", "webapp.internal.staging.swh.network", ] SETTINGS_DIR = os.path.dirname(settings.__file__) DEFAULT_CONFIG = { "allowed_hosts": ("list", []), "storage": ( "dict", - {"cls": "remote", "url": "http://127.0.0.1:5002/", "timeout": 10,}, + { + "cls": "remote", + "url": "http://127.0.0.1:5002/", + "timeout": 10, + }, ), "indexer_storage": ( "dict", - {"cls": "remote", "url": "http://127.0.0.1:5007/", "timeout": 1,}, + { + "cls": "remote", + "url": "http://127.0.0.1:5007/", + "timeout": 1, + }, ), "counters": ( "dict", - {"cls": "remote", "url": "http://127.0.0.1:5011/", "timeout": 1,}, + { + "cls": "remote", + "url": "http://127.0.0.1:5011/", + "timeout": 1, + }, ), "search": ( "dict", - {"cls": "remote", "url": "http://127.0.0.1:5010/", "timeout": 10,}, + { + "cls": "remote", + "url": "http://127.0.0.1:5010/", + "timeout": 10, + }, ), "search_config": ( "dict", - {"metadata_backend": "swh-indexer-storage",}, # or "swh-search" + { + "metadata_backend": "swh-indexer-storage", + }, # or "swh-search" ), "log_dir": ("string", "/tmp/swh/log"), "debug": ("bool", False), "serve_assets": ("bool", False), "host": ("string", "127.0.0.1"), "port": ("int", 5004), "secret_key": ("string", "development key"), # do not display code highlighting for content > 1MB "content_display_max_size": ("int", 5 * 1024 * 1024), "snapshot_content_max_size": ("int", 1000), "throttling": ( "dict", { "cache_uri": None, # production: memcached as cache (127.0.0.1:11211) # development: in-memory cache so None "scopes": { "swh_api": { "limiter_rate": {"default": "120/h"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_search": { "limiter_rate": {"default": "10/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_vault_cooking": { "limiter_rate": {"default": "120/h", "GET": "60/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_save_origin": { "limiter_rate": {"default": "120/h", "POST": "10/h"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_visit_latest": { "limiter_rate": {"default": "700/m"}, "exempted_networks": ["127.0.0.0/8"], }, }, }, ), - "vault": ("dict", {"cls": "remote", "args": {"url": "http://127.0.0.1:5005/",}}), + "vault": ( + "dict", + { + "cls": "remote", + "args": { + "url": "http://127.0.0.1:5005/", + }, + }, + ), "scheduler": ("dict", {"cls": "remote", "url": "http://127.0.0.1:5008/"}), "development_db": ("string", os.path.join(SETTINGS_DIR, "db.sqlite3")), "test_db": ("dict", {"name": "swh-web-test"}), "production_db": ("dict", {"name": 
"swh-web"}), "deposit": ( "dict", { "private_api_url": "https://deposit.softwareheritage.org/1/private/", "private_api_user": "swhworker", "private_api_password": "some-password", }, ), "e2e_tests_mode": ("bool", False), "es_workers_index_url": ("string", ""), "history_counters_url": ( "string", ( "http://counters1.internal.softwareheritage.org:5011" "/counters_history/history.json" ), ), "client_config": ("dict", {}), "keycloak": ("dict", {"server_url": "", "realm_name": ""}), "graph": ( "dict", { "server_url": "http://graph.internal.softwareheritage.org:5009/graph/", "max_edges": {"staff": 0, "user": 100000, "anonymous": 1000}, }, ), "status": ( "dict", { "server_url": "https://status.softwareheritage.org/", "json_path": "1.0/status/578e5eddcdc0cc7951000520", }, ), "counters_backend": ("string", "swh-storage"), # or "swh-counters" "staging_server_names": ("list", SWH_WEB_STAGING_SERVER_NAMES), "instance_name": ("str", "archive-test.softwareheritage.org"), "give": ("dict", {"public_key": "", "token": ""}), "features": ("dict", {"add_forge_now": True}), } swhweb_config: Dict[str, Any] = {} def get_config(config_file="web/web"): """Read the configuration file `config_file`. - If an environment variable SWH_CONFIG_FILENAME is defined, this - takes precedence over the config_file parameter. + If an environment variable SWH_CONFIG_FILENAME is defined, this + takes precedence over the config_file parameter. - In any case, update the app with parameters (secret_key, conf) - and return the parsed configuration as a dict. + In any case, update the app with parameters (secret_key, conf) + and return the parsed configuration as a dict. - If no configuration file is provided, return a default - configuration. + If no configuration file is provided, return a default + configuration. """ if not swhweb_config: config_filename = os.environ.get("SWH_CONFIG_FILENAME") if config_filename: config_file = config_filename cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, "log_dir") if swhweb_config.get("search"): swhweb_config["search"] = get_search(**swhweb_config["search"]) else: swhweb_config["search"] = None swhweb_config["storage"] = get_storage(**swhweb_config["storage"]) swhweb_config["vault"] = get_vault(**swhweb_config["vault"]) swhweb_config["indexer_storage"] = get_indexer_storage( **swhweb_config["indexer_storage"] ) swhweb_config["scheduler"] = get_scheduler(**swhweb_config["scheduler"]) swhweb_config["counters"] = get_counters(**swhweb_config["counters"]) return swhweb_config def search(): - """Return the current application's search. - - """ + """Return the current application's search.""" return get_config()["search"] def storage(): - """Return the current application's storage. - - """ + """Return the current application's storage.""" return get_config()["storage"] def vault(): - """Return the current application's vault. - - """ + """Return the current application's vault.""" return get_config()["vault"] def indexer_storage(): - """Return the current application's indexer storage. - - """ + """Return the current application's indexer storage.""" return get_config()["indexer_storage"] def scheduler(): - """Return the current application's scheduler. - - """ + """Return the current application's scheduler.""" return get_config()["scheduler"] def counters(): - """Return the current application's counters. 
- - """ + """Return the current application's counters.""" return get_config()["counters"] def is_feature_enabled(feature_name: str) -> bool: """Determine whether a feature is enabled or not. If feature_name is not found at all, it's considered disabled. """ return get_config()["features"].get(feature_name, False) diff --git a/swh/web/misc/badges.py b/swh/web/misc/badges.py index c5548f32..7fa846af 100644 --- a/swh/web/misc/badges.py +++ b/swh/web/misc/badges.py @@ -1,168 +1,186 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from base64 import b64encode from typing import Optional, cast from pybadges import badge from django.conf.urls import url from django.contrib.staticfiles import finders from django.http import HttpRequest, HttpResponse from swh.model.exceptions import ValidationError from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.web.common import archive from swh.web.common.exc import BadInputExc, NotFoundExc from swh.web.common.identifiers import parse_object_type, resolve_swhid from swh.web.common.utils import reverse _orange = "#f36a24" _blue = "#0172b2" _red = "#cd5741" _swh_logo_data = None _badge_config = { - "content": {"color": _blue, "title": "Archived source file",}, - "directory": {"color": _blue, "title": "Archived source tree",}, - "origin": {"color": _orange, "title": "Archived software repository",}, - "release": {"color": _blue, "title": "Archived software release",}, - "revision": {"color": _blue, "title": "Archived commit",}, - "snapshot": {"color": _blue, "title": "Archived software repository snapshot",}, + "content": { + "color": _blue, + "title": "Archived source file", + }, + "directory": { + "color": _blue, + "title": "Archived source tree", + }, + "origin": { + "color": _orange, + "title": "Archived software repository", + }, + "release": { + "color": _blue, + "title": "Archived software release", + }, + "revision": { + "color": _blue, + "title": "Archived commit", + }, + "snapshot": { + "color": _blue, + "title": "Archived software repository snapshot", + }, "error": {"color": _red, "title": "An error occurred when generating the badge"}, } def _get_logo_data() -> str: """ Get data-URI for Software Heritage SVG logo to embed it in the generated badges. """ global _swh_logo_data if _swh_logo_data is None: swh_logo_path = cast(str, finders.find("img/swh-logo-white.svg")) with open(swh_logo_path, "rb") as swh_logo_file: _swh_logo_data = "data:image/svg+xml;base64,%s" % b64encode( swh_logo_file.read() ).decode("ascii") return _swh_logo_data def _swh_badge( request: HttpRequest, object_type: str, object_id: str, object_swhid: Optional[str] = "", ) -> HttpResponse: """ Generate a Software Heritage badge for a given object type and id. Args: request: input http request object_type: The type of swh object to generate a badge for, either *content*, *directory*, *revision*, *release*, *origin* or *snapshot* object_id: The id of the swh object, either an url for origin type or a *sha1* for other object types object_swhid: If provided, the object SWHID will not be recomputed Returns: HTTP response with content type *image/svg+xml* containing the SVG badge data. If the provided parameters are invalid, HTTP 400 status code will be returned. 
If the object can not be found in the archive, HTTP 404 status code will be returned. """ left_text = "error" whole_link = None try: if object_type == "origin": archive.lookup_origin({"url": object_id}) right_text = "repository" whole_link = reverse( "browse-origin", query_params={"origin_url": object_id} ) else: # when SWHID is provided, object type and id will be parsed # from it if object_swhid: parsed_swhid = QualifiedSWHID.from_string(object_swhid) parsed_object_type = parsed_swhid.object_type object_id = hash_to_hex(parsed_swhid.object_id) swh_object = archive.lookup_object(parsed_swhid.object_type, object_id) # remove SWHID qualified if any for badge text right_text = str( CoreSWHID( object_type=parsed_swhid.object_type, object_id=parsed_swhid.object_id, ) ) object_type = parsed_swhid.object_type.name.lower() else: parsed_object_type = parse_object_type(object_type) right_text = str( CoreSWHID( object_type=parsed_object_type, object_id=hash_to_bytes(object_id), ) ) swh_object = archive.lookup_object(parsed_object_type, object_id) whole_link = resolve_swhid(str(right_text))["browse_url"] # use release name for badge text if parsed_object_type == ObjectType.RELEASE: right_text = "release %s" % swh_object["name"] left_text = "archived" except (BadInputExc, ValidationError): right_text = f'invalid {object_type if object_type else "object"} id' object_type = "error" except NotFoundExc: right_text = f'{object_type if object_type else "object"} not found' object_type = "error" badge_data = badge( left_text=left_text, right_text=right_text, right_color=_badge_config[object_type]["color"], whole_link=request.build_absolute_uri(whole_link), whole_title=_badge_config[object_type]["title"], logo=_get_logo_data(), embed_logo=True, ) return HttpResponse(badge_data, content_type="image/svg+xml") def _swh_badge_swhid(request: HttpRequest, object_swhid: str) -> HttpResponse: """ Generate a Software Heritage badge for a given object SWHID. Args: request (django.http.HttpRequest): input http request object_swhid (str): a SWHID of an archived object Returns: django.http.HttpResponse: An http response with content type *image/svg+xml* containing the SVG badge data. If any error occurs, a status code of 400 will be returned. 
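    For instance, the badge endpoint URL for a given object can be computed as
    follows (the SWHID below is illustrative):

    .. code-block:: python

        from swh.web.common.utils import reverse

        badge_url = reverse(
            "swh-badge-swhid",
            url_args={
                "object_swhid": "swh:1:dir:977fc4b98c0e85816348cebd3b12026407c368b6"
            },
        )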
""" return _swh_badge(request, "", "", object_swhid) urlpatterns = [ url( r"^badge/(?P<object_type>[a-z]+)/(?P<object_id>.+)/$", _swh_badge, name="swh-badge", ), url( r"^badge/(?P<object_swhid>swh:[0-9]+:[a-z]+:[0-9a-f]+.*)/$", _swh_badge_swhid, name="swh-badge-swhid", ), ] diff --git a/swh/web/misc/coverage.py b/swh/web/misc/coverage.py index 6d242ca3..a6d9225d 100644 --- a/swh/web/misc/coverage.py +++ b/swh/web/misc/coverage.py @@ -1,431 +1,430 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter, defaultdict from typing import Any, Dict, List, Tuple from urllib.parse import urlparse from django.conf.urls import url from django.http.request import HttpRequest from django.http.response import HttpResponse from django.shortcuts import render from django.views.decorators.cache import never_cache from django.views.decorators.clickjacking import xframe_options_exempt from swh.scheduler.model import SchedulerMetrics from swh.web.common import archive from swh.web.common.origin_save import get_savable_visit_types from swh.web.common.utils import ( django_cache, get_deposits_list, is_swh_web_production, reverse, ) from swh.web.config import scheduler _swh_arch_overview_doc = ( "https://docs.softwareheritage.org/devel/architecture/overview.html" ) # Current coverage list of the archive in a high level overview fashion, # categorized as follow: # - listed origins: origins discovered using a swh lister # - legacy: origins where public hosting service has closed # - deposited: origins coming from swh-deposit # # TODO: Store that list in a database table somewhere (swh-scheduler, swh-storage ?) # and retrieve it dynamically listed_origins: Dict[str, Any] = { "info": ( "These software origins get continuously discovered and archived using " f'the <a href="{_swh_arch_overview_doc}#listers" target="_blank" ' 'rel="noopener noreferrer">listers</a> implemented by Software Heritage.' 
), "origins": [ { "type": "bitbucket", "info_url": "https://bitbucket.org", "info": "public repositories from Bitbucket", "search_pattern": "https://bitbucket.org/", }, { "type": "cgit", "info_url": "https://git.zx2c4.com/cgit/about", "info": "public repositories from cgit instances", "search_pattern": "cgit", }, { "type": "CRAN", "info_url": "https://cran.r-project.org", "info": "source packages from The Comprehensive R Archive Network", "search_pattern": "https://cran.r-project.org/", }, { "type": "debian", "info_url": "https://www.debian.org", "info": "source packages from Debian and Debian-based distributions", "search_pattern": "deb://", }, { "type": "gitea", "info_url": "https://gitea.io", "info": "public repositories from Gitea instances", "search_pattern": "gitea", }, { "type": "github", "info_url": "https://github.com", "info": "public repositories from GitHub", "search_pattern": "https://github.com/", }, { "type": "gitlab", "info_url": "https://gitlab.com", "info": "public repositories from multiple GitLab instances", "search_pattern": "gitlab", }, { "type": "guix", "info_url": "https://guix.gnu.org", "info": "source code tarballs used to build the Guix package collection", "visit_types": ["nixguix"], "search_pattern": "https://guix.gnu.org/sources.json", }, { "type": "GNU", "info_url": "https://www.gnu.org", "info": "releases from the GNU project (as of August 2015)", "search_pattern": "gnu", }, { "type": "heptapod", "info_url": "https://heptapod.net/", "info": "public repositories from multiple Heptapod instances", "search_pattern": "heptapod", }, { "type": "launchpad", "info_url": "https://launchpad.net", "logo": "img/logos/launchpad.png", "info": "public repositories from Launchpad", "search_pattern": "https://git.launchpad.net/", }, { "type": "nixos", "info_url": "https://nixos.org", "info": "source code tarballs used to build the Nix package collection", "visit_types": ["nixguix"], "search_pattern": ( "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" ), }, { "type": "npm", "info_url": "https://www.npmjs.com", "info": "public packages from the package registry for javascript", "search_pattern": "https://www.npmjs.com", }, { "type": "opam", "info_url": "https://opam.ocaml.org/", "info": "public packages from the source-based package manager for OCaml", "search_pattern": "opam+https://opam.ocaml.org/", }, # apart our forge, most phabricator origins have not been archived # while they have been listed so do not display those type of origins # until new listing processes have been executed and origins loaded # # { # "type": "phabricator", # "info_url": "https://www.phacility.com/phabricator", # "info": "public repositories from multiple Phabricator instances", # "search_pattern": "phabricator", # }, { "type": "pypi", "info_url": "https://pypi.org", "info": "source packages from the Python Package Index", "search_pattern": "https://pypi.org", }, { "type": "sourceforge", "info_url": "https://sourceforge.net", "info": "public repositories from SourceForge", "search_pattern": "code.sf.net", }, ], } legacy_origins: Dict[str, Any] = { "info": ( "Discontinued hosting services. Those origins have been archived " "by Software Heritage." 
), "origins": [ { "type": "gitorious", "info_url": "https://en.wikipedia.org/wiki/Gitorious", "info": ( "public repositories from the former Gitorious code hosting service" ), "visit_types": ["git"], "search_pattern": "https://gitorious.org", "count": "122,014", }, { "type": "googlecode", "info_url": "https://code.google.com/archive", "info": ( "public repositories from the former Google Code project " "hosting service" ), "visit_types": ["git", "hg", "svn"], "search_pattern": "googlecode.com", "count": "790,026", }, { "type": "bitbucket", "info_url": "https://bitbucket.org", "info": "public repositories from Bitbucket", "search_pattern": "https://bitbucket.org/", "visit_types": ["hg"], "count": "336,795", }, ], } deposited_origins: Dict[str, Any] = { "info": ( "These origins are directly pushed into the archive by trusted partners " f'using the <a href="{_swh_arch_overview_doc}#deposit" target="_blank" ' 'rel="noopener noreferrer">deposit</a> service of Software Heritage.' ), "origins": [ { "type": "elife", "info_url": "https://elifesciences.org", "info": ( "research software source code associated to the articles " "eLife publishes" ), "search_pattern": "elife.stencila.io", "visit_types": ["deposit"], }, { "type": "hal", "info_url": "https://hal.archives-ouvertes.fr", "info": "scientific software source code deposited in the open archive HAL", "visit_types": ["deposit"], "search_pattern": "hal.archives-ouvertes.fr", }, { "type": "ipol", "info_url": "https://www.ipol.im", "info": "software artifacts associated to the articles IPOL publishes", "visit_types": ["deposit"], "search_pattern": "doi.org/10.5201", }, ], } _cache_timeout = 60 * 60 # one hour def _get_listers_metrics( cache_metrics: bool = False, ) -> Dict[str, List[Tuple[str, SchedulerMetrics]]]: """Returns scheduler metrics in the following mapping: Dict[lister_name, List[Tuple[instance_name, SchedulerMetrics]]] as a lister instance has one SchedulerMetrics object per visit type. """ @django_cache( timeout=_cache_timeout, catch_exception=True, exception_return_value={}, invalidate_cache_pred=lambda m: not cache_metrics, ) def _get_listers_metrics_internal(): listers_metrics = defaultdict(list) listers = scheduler().get_listers() scheduler_metrics = scheduler().get_metrics() for lister in listers: for metrics in filter( lambda m: m.lister_id == lister.id, scheduler_metrics ): listers_metrics[lister.name].append((lister.instance_name, metrics)) return listers_metrics return _get_listers_metrics_internal() def _get_deposits_netloc_counts(cache_counts: bool = False) -> Counter: - """Return deposit counts per origin url network location. 
- """ + """Return deposit counts per origin url network location.""" def _process_origin_url(origin_url): parsed_url = urlparse(origin_url) netloc = parsed_url.netloc # special treatment for doi.org netloc as it is not specific enough # for origins mapping if parsed_url.netloc == "doi.org": netloc += "/" + parsed_url.path.split("/")[1] return netloc @django_cache( timeout=_cache_timeout, catch_exception=True, exception_return_value=Counter(), invalidate_cache_pred=lambda m: not cache_counts, ) def _get_deposits_netloc_counts_internal(): netlocs = [] deposits = get_deposits_list() netlocs = [ _process_origin_url(d["origin_url"]) for d in deposits if d["status"] == "done" ] deposits_netloc_counts = Counter(netlocs) return deposits_netloc_counts return _get_deposits_netloc_counts_internal() def _get_nixguix_origins_count(origin_url: str, cache_count: bool = False) -> int: """Returns number of archived tarballs for NixOS, aka the number of branches in a dedicated origin in the archive. """ @django_cache( timeout=_cache_timeout, catch_exception=True, exception_return_value=0, invalidate_cache_pred=lambda m: not cache_count, ) def _get_nixguix_origins_count_internal(): snapshot = archive.lookup_latest_origin_snapshot(origin_url) if snapshot: snapshot_sizes = archive.lookup_snapshot_sizes(snapshot["id"]) nixguix_origins_count = snapshot_sizes["release"] else: nixguix_origins_count = 0 return nixguix_origins_count return _get_nixguix_origins_count_internal() def _search_url(query: str, visit_type: str) -> str: return reverse( "browse-search", query_params={ "q": query, "visit_type": visit_type, "with_visit": "true", "with_content": "true", }, ) @xframe_options_exempt @never_cache def _swh_coverage(request: HttpRequest) -> HttpResponse: use_cache = is_swh_web_production(request) listers_metrics = _get_listers_metrics(use_cache) for origins in listed_origins["origins"]: origins["instances"] = {} origins_type = origins["type"] # special processing for nixos/guix origins as there is no # scheduler metrics for those if origins_type in ("nixos", "guix"): count = _get_nixguix_origins_count(origins["search_pattern"], use_cache) origins["count"] = f"{count:,}" if count else "" origins["instances"][origins_type] = {"nixguix": {"count": count}} if origins_type not in listers_metrics: continue count_total = sum( [metrics.origins_known for _, metrics in listers_metrics[origins_type]] ) count_never_visited = sum( [ metrics.origins_never_visited for _, metrics in listers_metrics[origins_type] ] ) count = count_total - count_never_visited origins["count"] = f"{count:,}" origins["instances"] = defaultdict(dict) for instance, metrics in listers_metrics[origins_type]: # these types are available in staging/docker but not yet in production if ( metrics.visit_type in ("bzr", "cvs") and metrics.visit_type not in get_savable_visit_types() ): continue instance_count = metrics.origins_known - metrics.origins_never_visited origins["instances"][instance].update( {metrics.visit_type: {"count": f"{instance_count:,}"}} ) origins["visit_types"] = list( set(origins["instances"][instance].keys()) | set(origins.get("visit_types", [])) ) if origins_type == "CRAN": origins["instances"]["cran"]["cran"] = {"count": origins["count"]} # defaultdict cannot be iterated in django template origins["instances"] = dict(origins["instances"]) for origins in listed_origins["origins"]: instances = origins["instances"] nb_instances = len(instances) for instance_name, visit_types in instances.items(): for visit_type in visit_types: if nb_instances 
> 1: search_pattern = instance_name else: search_pattern = origins["search_pattern"] search_url = _search_url(search_pattern, visit_type) visit_types[visit_type]["search_url"] = search_url for origins in legacy_origins["origins"]: origins["search_urls"] = {} for visit_type in origins["visit_types"]: origins["search_urls"][visit_type] = _search_url( origins["search_pattern"], visit_type ) deposits_counts = _get_deposits_netloc_counts(use_cache) for origins in deposited_origins["origins"]: if origins["search_pattern"] in deposits_counts: origins["count"] = f"{deposits_counts[origins['search_pattern']]:,}" origins["search_urls"] = { "deposit": _search_url(origins["search_pattern"], "deposit") } return render( request, "misc/coverage.html", { "origins": { "Regular crawling": listed_origins, "Discontinued hosting": legacy_origins, "On demand archival": deposited_origins, } }, ) urlpatterns = [ url(r"^coverage/$", _swh_coverage, name="swh-coverage"), ] diff --git a/swh/web/misc/iframe.py b/swh/web/misc/iframe.py index df9c81bf..ab69e096 100644 --- a/swh/web/misc/iframe.py +++ b/swh/web/misc/iframe.py @@ -1,337 +1,340 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, List, Optional, Tuple from django.conf.urls import url from django.shortcuts import render from django.views.decorators.clickjacking import xframe_options_exempt from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.web.browse.snapshot_context import get_snapshot_context from swh.web.browse.utils import ( content_display_max_size, get_directory_entries, prepare_content_for_display, request_content, ) from swh.web.common import archive from swh.web.common.exc import BadInputExc, NotFoundExc, http_status_code_message from swh.web.common.identifiers import get_swhid, get_swhids_info from swh.web.common.typing import SnapshotContext, SWHObjectInfo from swh.web.common.utils import gen_path_info, reverse def _get_content_rendering_data(cnt_swhid: QualifiedSWHID, path: str) -> Dict[str, Any]: content_data = request_content(f"sha1_git:{cnt_swhid.object_id.hex()}") content = None language = None mimetype = None if content_data.get("raw_data") is not None: content_display_data = prepare_content_for_display( content_data["raw_data"], content_data["mimetype"], path ) content = content_display_data["content_data"] language = content_display_data["language"] mimetype = content_display_data["mimetype"] return { "content": content, "content_size": content_data.get("length"), "max_content_size": content_display_max_size, "filename": path.split("/")[-1], "encoding": content_data.get("encoding"), "mimetype": mimetype, "language": language, } def _get_directory_rendering_data( - dir_swhid: QualifiedSWHID, focus_swhid: QualifiedSWHID, path: str, + dir_swhid: QualifiedSWHID, + focus_swhid: QualifiedSWHID, + path: str, ) -> Dict[str, Any]: dirs, files = get_directory_entries(dir_swhid.object_id.hex()) for d in dirs: if d["type"] == "rev": d["url"] = None else: dir_swhid = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(d["target"]), origin=dir_swhid.origin, visit=dir_swhid.visit, anchor=dir_swhid.anchor, path=(path or "/") + d["name"] + "/", ) d["url"] = reverse( "swhid-iframe", url_args={"swhid": str(dir_swhid)}, 
query_params={"focus_swhid": str(focus_swhid)}, ) for f in files: object_id = hash_to_bytes(f["target"]) cnt_swhid = QualifiedSWHID( object_type=ObjectType.CONTENT, object_id=object_id, origin=dir_swhid.origin, visit=dir_swhid.visit, anchor=dir_swhid.anchor, path=(path or "/") + f["name"], lines=(focus_swhid.lines if object_id == focus_swhid.object_id else None), ) f["url"] = reverse( "swhid-iframe", url_args={"swhid": str(cnt_swhid)}, query_params={"focus_swhid": str(focus_swhid)}, ) return {"dirs": dirs, "files": files} def _get_breacrumbs_data( swhid: QualifiedSWHID, focus_swhid: QualifiedSWHID, path: str, snapshot_context: Optional[SnapshotContext] = None, ) -> Tuple[List[Dict[str, Any]], Optional[str]]: breadcrumbs = [] filename = None # strip any leading or trailing slash from path qualifier of SWHID if path and path[0] == "/": path = path[1:] if path and path[-1] == "/": path = path[:-1] if swhid.object_type == ObjectType.CONTENT: split_path = path.split("/") filename = split_path[-1] path = path[: -len(filename)] path_info = gen_path_info(path) if path != "/" else [] root_dir = None if snapshot_context and snapshot_context["root_directory"]: root_dir = snapshot_context["root_directory"] elif swhid.anchor and swhid.anchor.object_type == ObjectType.DIRECTORY: root_dir = swhid.anchor.object_id.hex() elif focus_swhid.object_type == ObjectType.DIRECTORY: root_dir = focus_swhid.object_id.hex() if root_dir: root_dir_swhid = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(root_dir), origin=swhid.origin, visit=swhid.visit, anchor=swhid.anchor, ) breadcrumbs.append( { "name": root_dir[:7], "object_id": root_dir_swhid.object_id.hex(), "path": "/", "url": reverse( "swhid-iframe", url_args={"swhid": str(root_dir_swhid)}, query_params={ "focus_swhid": focus_swhid if focus_swhid != root_dir_swhid else None }, ), } ) for pi in path_info: dir_info = archive.lookup_directory_with_path(root_dir, pi["path"]) dir_swhid = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(dir_info["target"]), origin=swhid.origin, visit=swhid.visit, anchor=swhid.anchor, path="/" + pi["path"] + "/", ) breadcrumbs.append( { "name": pi["name"], "object_id": dir_swhid.object_id.hex(), "path": dir_swhid.path.decode("utf-8") if dir_swhid.path else "", "url": reverse( "swhid-iframe", url_args={"swhid": str(dir_swhid)}, query_params={"focus_swhid": focus_swhid}, ), } ) if filename: breadcrumbs.append( { "name": filename, "object_id": swhid.object_id.hex(), "path": path, "url": "", } ) return breadcrumbs, root_dir @xframe_options_exempt def swhid_iframe(request, swhid: str): """Django view that can be embedded in an iframe to display objects archived by Software Heritage (currently contents and directories) in a minimalist Web UI. 
""" focus_swhid = request.GET.get("focus_swhid", swhid) parsed_swhid = None view_data = {} breadcrumbs: List[Dict[str, Any]] = [] swh_objects = [] snapshot_context = None swhids_info_extra_context = {} archive_link = None try: parsed_swhid = get_swhid(swhid) parsed_focus_swhid = get_swhid(focus_swhid) path = parsed_swhid.path.decode("utf-8") if parsed_swhid.path else "" snapshot_context = None revision_id = None if ( parsed_swhid.anchor and parsed_swhid.anchor.object_type == ObjectType.REVISION ): revision_id = parsed_swhid.anchor.object_id.hex() if parsed_swhid.origin or parsed_swhid.visit: snapshot_context = get_snapshot_context( origin_url=parsed_swhid.origin, snapshot_id=parsed_swhid.visit.object_id.hex() if parsed_swhid.visit else None, revision_id=revision_id, ) error_info: Dict[str, Any] = {"status_code": 200, "description": ""} if parsed_swhid and parsed_swhid.object_type == ObjectType.CONTENT: view_data = _get_content_rendering_data(parsed_swhid, path) swh_objects.append( SWHObjectInfo( object_type=ObjectType.CONTENT, object_id=parsed_swhid.object_id.hex(), ) ) elif parsed_swhid and parsed_swhid.object_type == ObjectType.DIRECTORY: view_data = _get_directory_rendering_data( parsed_swhid, parsed_focus_swhid, path ) swh_objects.append( SWHObjectInfo( object_type=ObjectType.DIRECTORY, object_id=parsed_swhid.object_id.hex(), ) ) elif parsed_swhid: error_info = { "status_code": 400, "description": ( f"Objects of type {parsed_swhid.object_type} are not supported" ), } swhids_info_extra_context["path"] = path if parsed_swhid and view_data: breadcrumbs, root_dir = _get_breacrumbs_data( parsed_swhid, parsed_focus_swhid, path, snapshot_context ) if parsed_swhid.object_type == ObjectType.CONTENT and len(breadcrumbs) > 1: swh_objects.append( SWHObjectInfo( object_type=ObjectType.DIRECTORY, object_id=breadcrumbs[-2]["object_id"], ) ) swhids_info_extra_context["path"] = breadcrumbs[-2]["path"] swhids_info_extra_context["filename"] = breadcrumbs[-1]["name"] if snapshot_context: swh_objects.append( SWHObjectInfo( object_type=ObjectType.REVISION, object_id=snapshot_context["revision_id"] or "", ) ) swh_objects.append( SWHObjectInfo( object_type=ObjectType.SNAPSHOT, object_id=snapshot_context["snapshot_id"] or "", ) ) archive_link = reverse("browse-swhid", url_args={"swhid": swhid}) if ( parsed_swhid.origin is None and parsed_swhid.visit is None and parsed_swhid.anchor is None and root_dir is not None ): # qualifier values cannot be used to get root directory from them, # we need to add it as anchor in the SWHID argument of the archive link root_dir_swhid = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(root_dir) ) archive_swhid = QualifiedSWHID( object_type=parsed_swhid.object_type, object_id=parsed_swhid.object_id, path=parsed_swhid.path, anchor=root_dir_swhid, ) archive_link = reverse( - "browse-swhid", url_args={"swhid": f"{archive_swhid}"}, + "browse-swhid", + url_args={"swhid": f"{archive_swhid}"}, ) except BadInputExc as e: error_info = {"status_code": 400, "description": f"BadInputExc: {str(e)}"} except NotFoundExc as e: error_info = {"status_code": 404, "description": f"NotFoundExc: {str(e)}"} except Exception as e: error_info = {"status_code": 500, "description": str(e)} return render( request, "misc/iframe.html", { **view_data, "iframe_mode": True, "object_type": parsed_swhid.object_type.value if parsed_swhid else None, "lines": parsed_swhid.lines if parsed_swhid else None, "breadcrumbs": breadcrumbs, "swhid": swhid, "focus_swhid": focus_swhid, "archive_link": 
archive_link, "error_code": error_info["status_code"], "error_message": http_status_code_message.get(error_info["status_code"]), "error_description": error_info["description"], "snapshot_context": None, "swhids_info": get_swhids_info( swh_objects, snapshot_context, swhids_info_extra_context ), }, status=error_info["status_code"], ) urlpatterns = [ url( r"^embed/(?P<swhid>swh:[0-9]+:[a-z]+:[0-9a-f]+.*)/$", swhid_iframe, name="swhid-iframe", ), ] diff --git a/swh/web/settings/common.py b/swh/web/settings/common.py index 48c98dc9..b15d2f4c 100644 --- a/swh/web/settings/common.py +++ b/swh/web/settings/common.py @@ -1,294 +1,312 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information """ Django common settings for swh-web. """ import os import sys from typing import Any, Dict from swh.web.auth.utils import OIDC_SWH_WEB_CLIENT_ID from swh.web.config import get_config swh_web_config = get_config() # Build paths inside the project like this: os.path.join(BASE_DIR, ...) PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = swh_web_config["secret_key"] # SECURITY WARNING: don't run with debug turned on in production! DEBUG = swh_web_config["debug"] DEBUG_PROPAGATE_EXCEPTIONS = swh_web_config["debug"] ALLOWED_HOSTS = ["127.0.0.1", "localhost"] + swh_web_config["allowed_hosts"] # Application definition INSTALLED_APPS = [ "django.contrib.admin", "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", "rest_framework", "swh.web.common", "swh.web.inbound_email", "swh.web.api", "swh.web.auth", "swh.web.browse", "swh.web.add_forge_now", "webpack_loader", "django_js_reverse", "corsheaders", ] MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", "django.contrib.sessions.middleware.SessionMiddleware", "corsheaders.middleware.CorsMiddleware", "django.middleware.common.CommonMiddleware", "django.middleware.csrf.CsrfViewMiddleware", "django.contrib.auth.middleware.AuthenticationMiddleware", "swh.auth.django.middlewares.OIDCSessionExpiredMiddleware", "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", "swh.web.common.middlewares.ThrottlingHeadersMiddleware", "swh.web.common.middlewares.ExceptionMiddleware", ] # Compress all assets (static ones and dynamically generated html) # served by django in a local development environment context. # In a production environment, assets compression will be directly # handled by web servers like apache or nginx. 
if swh_web_config["serve_assets"]: MIDDLEWARE.insert(0, "django.middleware.gzip.GZipMiddleware") ROOT_URLCONF = "swh.web.urls" TEMPLATES = [ { "BACKEND": "django.template.backends.django.DjangoTemplates", "DIRS": [os.path.join(PROJECT_DIR, "../templates")], "APP_DIRS": True, "OPTIONS": { "context_processors": [ "django.template.context_processors.debug", "django.template.context_processors.request", "django.contrib.auth.context_processors.auth", "django.contrib.messages.context_processors.messages", "swh.web.common.utils.context_processor", ], - "libraries": {"swh_templatetags": "swh.web.common.swh_templatetags",}, + "libraries": { + "swh_templatetags": "swh.web.common.swh_templatetags", + }, }, }, ] DATABASES = { "default": { "ENGINE": "django.db.backends.sqlite3", "NAME": swh_web_config.get("development_db", ""), } } # Password validation # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ { "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", # noqa }, - {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",}, - {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",}, - {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",}, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, ] # Internationalization # https://docs.djangoproject.com/en/1.11/topics/i18n/ LANGUAGE_CODE = "en-us" TIME_ZONE = "UTC" USE_I18N = True USE_L10N = True USE_TZ = True # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/1.11/howto/static-files/ STATIC_URL = "/static/" # static folder location when swh-web has been installed with pip STATIC_DIR = os.path.join(sys.prefix, "share/swh/web/static") if not os.path.exists(STATIC_DIR): # static folder location when developping swh-web STATIC_DIR = os.path.join(PROJECT_DIR, "../../../static") STATICFILES_DIRS = [STATIC_DIR] INTERNAL_IPS = ["127.0.0.1"] throttle_rates = {} http_requests = ["GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"] throttling = swh_web_config["throttling"] for limiter_scope, limiter_conf in throttling["scopes"].items(): if "default" in limiter_conf["limiter_rate"]: throttle_rates[limiter_scope] = limiter_conf["limiter_rate"]["default"] # for backward compatibility else: throttle_rates[limiter_scope] = limiter_conf["limiter_rate"] # register sub scopes specific for HTTP request types for http_request in http_requests: if http_request in limiter_conf["limiter_rate"]: throttle_rates[limiter_scope + "_" + http_request.lower()] = limiter_conf[ "limiter_rate" ][http_request] REST_FRAMEWORK: Dict[str, Any] = { "DEFAULT_RENDERER_CLASSES": ( "rest_framework.renderers.JSONRenderer", "swh.web.api.renderers.YAMLRenderer", "rest_framework.renderers.TemplateHTMLRenderer", ), "DEFAULT_THROTTLE_CLASSES": ( "swh.web.api.throttling.SwhWebRateThrottle", "swh.web.api.throttling.SwhWebUserRateThrottle", ), "DEFAULT_THROTTLE_RATES": throttle_rates, "DEFAULT_AUTHENTICATION_CLASSES": [ "rest_framework.authentication.SessionAuthentication", "swh.auth.django.backends.OIDCBearerTokenAuthentication", ], "EXCEPTION_HANDLER": "swh.web.api.apiresponse.error_response_handler", } LOGGING = { "version": 1, "disable_existing_loggers": False, "filters": { - "require_debug_false": {"()": 
"django.utils.log.RequireDebugFalse",}, - "require_debug_true": {"()": "django.utils.log.RequireDebugTrue",}, + "require_debug_false": { + "()": "django.utils.log.RequireDebugFalse", + }, + "require_debug_true": { + "()": "django.utils.log.RequireDebugTrue", + }, }, "formatters": { "request": { "format": "[%(asctime)s] [%(levelname)s] %(request)s %(status_code)s", "datefmt": "%d/%b/%Y %H:%M:%S", }, "simple": { "format": "[%(asctime)s] [%(levelname)s] %(message)s", "datefmt": "%d/%b/%Y %H:%M:%S", }, "verbose": { "format": ( "[%(asctime)s] [%(levelname)s] %(name)s.%(funcName)s:%(lineno)s " "- %(message)s" ), "datefmt": "%d/%b/%Y %H:%M:%S", }, }, "handlers": { "console": { "level": "DEBUG", "filters": ["require_debug_true"], "class": "logging.StreamHandler", "formatter": "simple", }, "file": { "level": "WARNING", "filters": ["require_debug_false"], "class": "logging.FileHandler", "filename": os.path.join(swh_web_config["log_dir"], "swh-web.log"), "formatter": "simple", }, "file_request": { "level": "WARNING", "filters": ["require_debug_false"], "class": "logging.FileHandler", "filename": os.path.join(swh_web_config["log_dir"], "swh-web.log"), "formatter": "request", }, "console_verbose": { "level": "DEBUG", "filters": ["require_debug_true"], "class": "logging.StreamHandler", "formatter": "verbose", }, "file_verbose": { "level": "WARNING", "filters": ["require_debug_false"], "class": "logging.FileHandler", "filename": os.path.join(swh_web_config["log_dir"], "swh-web.log"), "formatter": "verbose", }, - "null": {"class": "logging.NullHandler",}, + "null": { + "class": "logging.NullHandler", + }, }, "loggers": { "": { "handlers": ["console_verbose", "file_verbose"], "level": "DEBUG" if DEBUG else "WARNING", }, "django": { "handlers": ["console"], "level": "DEBUG" if DEBUG else "WARNING", "propagate": False, }, "django.request": { "handlers": ["file_request"], "level": "DEBUG" if DEBUG else "WARNING", "propagate": False, }, "django.db.backends": {"handlers": ["null"], "propagate": False}, - "django.utils.autoreload": {"level": "INFO",}, - "swh.core.statsd": {"level": "INFO",}, + "django.utils.autoreload": { + "level": "INFO", + }, + "swh.core.statsd": { + "level": "INFO", + }, }, } WEBPACK_LOADER = { "DEFAULT": { "CACHE": False, "BUNDLE_DIR_NAME": "./", "STATS_FILE": os.path.join(STATIC_DIR, "webpack-stats.json"), "POLL_INTERVAL": 0.1, "TIMEOUT": None, "IGNORE": [".+\\.hot-update.js", ".+\\.map"], } } LOGIN_URL = "/admin/login/" LOGIN_REDIRECT_URL = "admin" SESSION_ENGINE = "django.contrib.sessions.backends.cache" CACHES = { "default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}, } JS_REVERSE_JS_MINIFY = False CORS_ORIGIN_ALLOW_ALL = True CORS_URLS_REGEX = r"^/(badge|api)/.*$" AUTHENTICATION_BACKENDS = [ "django.contrib.auth.backends.ModelBackend", "swh.auth.django.backends.OIDCAuthorizationCodePKCEBackend", ] SWH_AUTH_SERVER_URL = swh_web_config["keycloak"]["server_url"] SWH_AUTH_REALM_NAME = swh_web_config["keycloak"]["realm_name"] SWH_AUTH_CLIENT_ID = OIDC_SWH_WEB_CLIENT_ID SWH_AUTH_SESSION_EXPIRED_REDIRECT_VIEW = "logout" diff --git a/swh/web/tests/add_forge_now/test_views.py b/swh/web/tests/add_forge_now/test_views.py index 843f5b37..48874ebf 100644 --- a/swh/web/tests/add_forge_now/test_views.py +++ b/swh/web/tests/add_forge_now/test_views.py @@ -1,203 +1,207 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See 
top-level LICENSE file for more information import json import pytest from swh.web.common.utils import reverse from swh.web.tests.api.views.test_add_forge_now import create_add_forge_request from swh.web.tests.utils import check_http_get_response NB_FORGE_TYPE = 2 NB_FORGES_PER_TYPE = 20 def create_add_forge_requests(client, regular_user, regular_user2): requests = [] for i in range(NB_FORGES_PER_TYPE): request = { "forge_type": "gitlab", "forge_url": f"https://gitlab.example{i:02d}.org", "forge_contact_email": f"admin@gitlab.example{i:02d}.org", "forge_contact_name": f"gitlab.example{i:02d}.org admin", "forge_contact_comment": "user marked as owner in forge members", } create_add_forge_request( - client, regular_user, data=request, + client, + regular_user, + data=request, ) requests.append(request) request = { "forge_type": "gitea", "forge_url": f"https://gitea.example{i:02d}.org", "forge_contact_email": f"admin@gitea.example{i:02d}.org", "forge_contact_name": f"gitea.example{i:02d}.org admin", "forge_contact_comment": "user marked as owner in forge members", } create_add_forge_request( - client, regular_user2, data=request, + client, + regular_user2, + data=request, ) requests.append(request) return requests @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_datatables_no_parameters( client, regular_user, regular_user2 ): create_add_forge_requests(client, regular_user, regular_user2) url = reverse("add-forge-request-list-datatables") resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) length = 10 assert data["draw"] == 0 assert data["recordsFiltered"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["recordsTotal"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert len(data["data"]) == length # default ordering is by descending id assert data["data"][0]["id"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["data"][-1]["id"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE - length + 1 assert "submitter_name" not in data["data"][0] @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_datatables( client, regular_user, regular_user2, add_forge_moderator ): create_add_forge_requests(client, regular_user, regular_user2) length = 10 url = reverse( "add-forge-request-list-datatables", query_params={"draw": 1, "length": length, "start": 0}, ) client.force_login(regular_user) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == 1 assert data["recordsFiltered"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["recordsTotal"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert len(data["data"]) == length # default ordering is by descending id assert data["data"][0]["id"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["data"][-1]["id"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE - length + 1 assert "submitter_name" not in data["data"][0] client.force_login(add_forge_moderator) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == 1 assert data["recordsFiltered"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["recordsTotal"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert len(data["data"]) == length # default ordering is by descending id assert data["data"][0]["id"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["data"][-1]["id"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE - length + 1 assert "submitter_name" in data["data"][0] @pytest.mark.django_db(transaction=True, 
reset_sequences=True) def test_add_forge_request_list_datatables_ordering( client, regular_user, regular_user2 ): requests = create_add_forge_requests(client, regular_user, regular_user2) requests_sorted = list(sorted(requests, key=lambda d: d["forge_url"])) forge_urls_asc = [request["forge_url"] for request in requests_sorted] forge_urls_desc = list(reversed(forge_urls_asc)) length = 10 for direction in ("asc", "desc"): for i in range(4): url = reverse( "add-forge-request-list-datatables", query_params={ "draw": 1, "length": length, "start": i * length, "order[0][column]": 2, "order[0][dir]": direction, "columns[2][name]": "forge_url", }, ) client.force_login(regular_user) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == 1 assert data["recordsFiltered"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert data["recordsTotal"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert len(data["data"]) == length page_forge_urls = [request["forge_url"] for request in data["data"]] if direction == "asc": expected_forge_urls = forge_urls_asc[i * length : (i + 1) * length] else: expected_forge_urls = forge_urls_desc[i * length : (i + 1) * length] assert page_forge_urls == expected_forge_urls @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_datatables_search(client, regular_user, regular_user2): create_add_forge_requests(client, regular_user, regular_user2) url = reverse( "add-forge-request-list-datatables", query_params={ "draw": 1, "length": NB_FORGES_PER_TYPE, "start": 0, "search[value]": "gitlab", }, ) client.force_login(regular_user) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == 1 assert data["recordsFiltered"] == NB_FORGES_PER_TYPE assert data["recordsTotal"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert len(data["data"]) == NB_FORGES_PER_TYPE page_forge_type = [request["forge_type"] for request in data["data"]] assert page_forge_type == ["gitlab"] * NB_FORGES_PER_TYPE @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_datatables_user_requests( client, regular_user, regular_user2 ): create_add_forge_requests(client, regular_user, regular_user2) url = reverse( "add-forge-request-list-datatables", query_params={ "draw": 1, "length": NB_FORGES_PER_TYPE * NB_FORGE_TYPE, "start": 0, "user_requests_only": 1, }, ) client.force_login(regular_user2) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == 1 assert data["recordsFiltered"] == NB_FORGES_PER_TYPE assert data["recordsTotal"] == NB_FORGE_TYPE * NB_FORGES_PER_TYPE assert len(data["data"]) == NB_FORGES_PER_TYPE page_forge_type = [request["forge_type"] for request in data["data"]] assert page_forge_type == ["gitea"] * NB_FORGES_PER_TYPE diff --git a/swh/web/tests/api/views/test_add_forge_now.py b/swh/web/tests/api/views/test_add_forge_now.py index 85999a24..a6e7453c 100644 --- a/swh/web/tests/api/views/test_add_forge_now.py +++ b/swh/web/tests/api/views/test_add_forge_now.py @@ -1,523 +1,537 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import threading import time from typing import Dict from urllib.parse import urlencode import iso8601 import pytest from 
swh.web.add_forge_now.models import Request from swh.web.common.utils import reverse from swh.web.tests.utils import ( check_api_get_responses, check_api_post_response, check_http_post_response, ) @pytest.mark.django_db def test_add_forge_request_create_anonymous_user(api_client): url = reverse("api-1-add-forge-request-create") check_api_post_response(api_client, url, status_code=403) @pytest.mark.django_db def test_add_forge_request_create_empty(api_client, regular_user): api_client.force_login(regular_user) url = reverse("api-1-add-forge-request-create") resp = check_api_post_response(api_client, url, status_code=400) assert '"forge_type"' in resp.data["reason"] ADD_FORGE_DATA_FORGE1: Dict = { "forge_type": "gitlab", "forge_url": "https://gitlab.example.org", "forge_contact_email": "admin@gitlab.example.org", "forge_contact_name": "gitlab.example.org admin", "forge_contact_comment": "user marked as owner in forge members", "submitter_forward_username": True, } ADD_FORGE_DATA_FORGE2: Dict = { "forge_type": "gitea", "forge_url": "https://gitea.example.org", "forge_contact_email": "admin@gitea.example.org", "forge_contact_name": "gitea.example.org admin", "forge_contact_comment": "user marked as owner in forge members", "submitter_forward_username": True, } ADD_FORGE_DATA_FORGE3: Dict = { "forge_type": "heptapod", "forge_url": "https://heptapod.host/", "forge_contact_email": "admin@example.org", "forge_contact_name": "heptapod admin", "forge_contact_comment": "", # authorized empty or null comment "submitter_forward_username": False, } ADD_FORGE_DATA_FORGE4: Dict = { **ADD_FORGE_DATA_FORGE3, "forge_url": "https://heptapod2.host/", "submitter_forward_username": "on", } ADD_FORGE_DATA_FORGE5: Dict = { **ADD_FORGE_DATA_FORGE3, "forge_url": "https://heptapod3.host/", "submitter_forward_username": "off", } @pytest.mark.django_db(transaction=True, reset_sequences=True) @pytest.mark.parametrize( "add_forge_data", [ ADD_FORGE_DATA_FORGE1, ADD_FORGE_DATA_FORGE2, ADD_FORGE_DATA_FORGE3, ADD_FORGE_DATA_FORGE4, ], ) def test_add_forge_request_create_success_post( api_client, regular_user, add_forge_data ): api_client.force_login(regular_user) url = reverse("api-1-add-forge-request-create") date_before = datetime.datetime.now(tz=datetime.timezone.utc) resp = check_api_post_response( - api_client, url, data=add_forge_data, status_code=201, + api_client, + url, + data=add_forge_data, + status_code=201, ) date_after = datetime.datetime.now(tz=datetime.timezone.utc) consent = add_forge_data["submitter_forward_username"] # map the expected result to what should be read back from the db, to ease comparison expected_consent_bool = consent == "on" if isinstance(consent, str) else consent assert resp.data == { **add_forge_data, "id": resp.data["id"], "status": "PENDING", "submission_date": resp.data["submission_date"], "submitter_name": regular_user.username, "submitter_email": regular_user.email, "submitter_forward_username": expected_consent_bool, } assert date_before < iso8601.parse_date(resp.data["submission_date"]) < date_after request = Request.objects.all().last() assert request.forge_url == add_forge_data["forge_url"] assert request.submitter_name == regular_user.username @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_create_success_form_encoded(client, regular_user): client.force_login(regular_user) url = reverse("api-1-add-forge-request-create") date_before = datetime.datetime.now(tz=datetime.timezone.utc) resp = check_http_post_response( client, url,
request_content_type="application/x-www-form-urlencoded", data=urlencode(ADD_FORGE_DATA_FORGE1), status_code=201, ) date_after = datetime.datetime.now(tz=datetime.timezone.utc) assert resp.data == { **ADD_FORGE_DATA_FORGE1, "id": 1, "status": "PENDING", "submission_date": resp.data["submission_date"], "submitter_name": regular_user.username, "submitter_email": regular_user.email, } assert date_before < iso8601.parse_date(resp.data["submission_date"]) < date_after request = Request.objects.all()[0] assert request.forge_url == ADD_FORGE_DATA_FORGE1["forge_url"] assert request.submitter_name == regular_user.username @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_create_duplicate(api_client, regular_user): api_client.force_login(regular_user) url = reverse("api-1-add-forge-request-create") check_api_post_response( - api_client, url, data=ADD_FORGE_DATA_FORGE1, status_code=201, + api_client, + url, + data=ADD_FORGE_DATA_FORGE1, + status_code=201, ) check_api_post_response( - api_client, url, data=ADD_FORGE_DATA_FORGE1, status_code=409, + api_client, + url, + data=ADD_FORGE_DATA_FORGE1, + status_code=409, ) requests = Request.objects.all() assert len(requests) == 1 @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_anonymous_user(api_client): url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response(api_client, url, status_code=403) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_regular_user(api_client, regular_user): api_client.force_login(regular_user) url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response(api_client, url, status_code=403) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_non_existent(api_client, add_forge_moderator): api_client.force_login(add_forge_moderator) url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response(api_client, url, status_code=400) def create_add_forge_request(api_client, regular_user, data=ADD_FORGE_DATA_FORGE1): api_client.force_login(regular_user) url = reverse("api-1-add-forge-request-create") - return check_api_post_response(api_client, url, data=data, status_code=201,) + return check_api_post_response( + api_client, + url, + data=data, + status_code=201, + ) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_empty(api_client, regular_user, add_forge_moderator): create_add_forge_request(api_client, regular_user) api_client.force_login(add_forge_moderator) url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response(api_client, url, status_code=400) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_missing_field( api_client, regular_user, add_forge_moderator ): create_add_forge_request(api_client, regular_user) api_client.force_login(add_forge_moderator) url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response(api_client, url, data={}, status_code=400) check_api_post_response( api_client, url, data={"new_status": "REJECTED"}, status_code=400 ) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update(api_client, regular_user, add_forge_moderator): create_add_forge_request(api_client, regular_user) api_client.force_login(add_forge_moderator) url = 
reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response( api_client, url, data={"text": "updating request"}, status_code=200 ) check_api_post_response( api_client, url, data={"new_status": "REJECTED", "text": "request rejected"}, status_code=200, ) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_invalid_new_status( api_client, regular_user, add_forge_moderator ): create_add_forge_request(api_client, regular_user) api_client.force_login(add_forge_moderator) url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) check_api_post_response( api_client, url, data={"new_status": "ACCEPTED", "text": "request accepted"}, status_code=400, ) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_update_status_concurrent( api_client, regular_user, add_forge_moderator, mocker ): _block_while_testing = mocker.patch( "swh.web.api.views.add_forge_now._block_while_testing" ) _block_while_testing.side_effect = lambda: time.sleep(1) create_add_forge_request(api_client, regular_user) api_client.force_login(add_forge_moderator) url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) worker_ended = False def worker(): nonlocal worker_ended check_api_post_response( api_client, url, data={"new_status": "WAITING_FOR_FEEDBACK", "text": "waiting for message"}, status_code=200, ) worker_ended = True # this thread will first modify the request status to WAITING_FOR_FEEDBACK thread = threading.Thread(target=worker) thread.start() # the other thread (slower) will attempt to modify the request status to REJECTED # but it will not be allowed as the first faster thread already modified it # and REJECTED state can not be reached from WAITING_FOR_FEEDBACK one time.sleep(0.5) check_api_post_response( api_client, url, data={"new_status": "REJECTED", "text": "request accepted"}, status_code=400, ) thread.join() assert worker_ended @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_anonymous(api_client, regular_user): url = reverse("api-1-add-forge-request-list") resp = check_api_get_responses(api_client, url, status_code=200) assert resp.data == [] create_add_forge_request(api_client, regular_user) resp = check_api_get_responses(api_client, url, status_code=200) add_forge_request = { "forge_url": ADD_FORGE_DATA_FORGE1["forge_url"], "forge_type": ADD_FORGE_DATA_FORGE1["forge_type"], "status": "PENDING", "submission_date": resp.data[0]["submission_date"], "id": 1, } assert resp.data == [add_forge_request] create_add_forge_request(api_client, regular_user, data=ADD_FORGE_DATA_FORGE2) resp = check_api_get_responses(api_client, url, status_code=200) other_forge_request = { "forge_url": ADD_FORGE_DATA_FORGE2["forge_url"], "forge_type": ADD_FORGE_DATA_FORGE2["forge_type"], "status": "PENDING", "submission_date": resp.data[0]["submission_date"], "id": 2, } assert resp.data == [other_forge_request, add_forge_request] @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_moderator( api_client, regular_user, add_forge_moderator ): url = reverse("api-1-add-forge-request-list") create_add_forge_request(api_client, regular_user) create_add_forge_request(api_client, regular_user, data=ADD_FORGE_DATA_FORGE2) api_client.force_login(add_forge_moderator) resp = check_api_get_responses(api_client, url, status_code=200) add_forge_request = { **ADD_FORGE_DATA_FORGE1, "status": "PENDING", "submission_date": 
resp.data[1]["submission_date"], "submitter_name": regular_user.username, "submitter_email": regular_user.email, "id": 1, } other_forge_request = { **ADD_FORGE_DATA_FORGE2, "status": "PENDING", "submission_date": resp.data[0]["submission_date"], "submitter_name": regular_user.username, "submitter_email": regular_user.email, "id": 2, } assert resp.data == [other_forge_request, add_forge_request] @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_pagination( api_client, regular_user, api_request_factory ): create_add_forge_request(api_client, regular_user) create_add_forge_request(api_client, regular_user, data=ADD_FORGE_DATA_FORGE2) url = reverse("api-1-add-forge-request-list", query_params={"per_page": 1}) resp = check_api_get_responses(api_client, url, 200) assert len(resp.data) == 1 request = api_request_factory.get(url) next_url = reverse( "api-1-add-forge-request-list", query_params={"page": 2, "per_page": 1}, request=request, ) assert resp["Link"] == f'<{next_url}>; rel="next"' resp = check_api_get_responses(api_client, next_url, 200) assert len(resp.data) == 1 prev_url = reverse( "api-1-add-forge-request-list", query_params={"page": 1, "per_page": 1}, request=request, ) assert resp["Link"] == f'<{prev_url}>; rel="previous"' @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_list_submitter_filtering( api_client, regular_user, regular_user2 ): create_add_forge_request(api_client, regular_user) create_add_forge_request(api_client, regular_user2, data=ADD_FORGE_DATA_FORGE2) api_client.force_login(regular_user) url = reverse( "api-1-add-forge-request-list", query_params={"user_requests_only": 1} ) resp = check_api_get_responses(api_client, url, status_code=200) assert len(resp.data) == 1 @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_get(api_client, regular_user, add_forge_moderator): resp = create_add_forge_request(api_client, regular_user) submission_date = resp.data["submission_date"] url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) api_client.force_login(add_forge_moderator) check_api_post_response( api_client, url, data={"new_status": "WAITING_FOR_FEEDBACK", "text": "waiting for message"}, status_code=200, ) api_client.logout() url = reverse("api-1-add-forge-request-get", url_args={"id": 1}) resp = check_api_get_responses(api_client, url, status_code=200) assert resp.data == { "request": { "forge_url": ADD_FORGE_DATA_FORGE1["forge_url"], "forge_type": ADD_FORGE_DATA_FORGE1["forge_type"], "id": 1, "status": "WAITING_FOR_FEEDBACK", "submission_date": submission_date, }, "history": [ { "id": 1, "actor_role": "SUBMITTER", "date": resp.data["history"][0]["date"], "new_status": "PENDING", }, { "id": 2, "actor_role": "MODERATOR", "date": resp.data["history"][1]["date"], "new_status": "WAITING_FOR_FEEDBACK", }, ], } @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_get_moderator(api_client, regular_user, add_forge_moderator): resp = create_add_forge_request(api_client, regular_user) submission_date = resp.data["submission_date"] url = reverse("api-1-add-forge-request-update", url_args={"id": 1}) api_client.force_login(add_forge_moderator) check_api_post_response( api_client, url, data={"new_status": "WAITING_FOR_FEEDBACK", "text": "waiting for message"}, status_code=200, ) url = reverse("api-1-add-forge-request-get", url_args={"id": 1}) resp = check_api_get_responses(api_client, url, status_code=200) assert 
resp.data == { "request": { **ADD_FORGE_DATA_FORGE1, "id": 1, "status": "WAITING_FOR_FEEDBACK", "submission_date": submission_date, "submitter_name": regular_user.username, "submitter_email": regular_user.email, }, "history": [ { "id": 1, "text": "", "actor": regular_user.username, "actor_role": "SUBMITTER", "date": resp.data["history"][0]["date"], "new_status": "PENDING", }, { "id": 2, "text": "waiting for message", "actor": add_forge_moderator.username, "actor_role": "MODERATOR", "date": resp.data["history"][1]["date"], "new_status": "WAITING_FOR_FEEDBACK", }, ], } @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_add_forge_request_get_invalid(api_client): url = reverse("api-1-add-forge-request-get", url_args={"id": 3}) check_api_get_responses(api_client, url, status_code=400) diff --git a/swh/web/tests/api/views/test_graph.py b/swh/web/tests/api/views/test_graph.py index a8160178..be8d6711 100644 --- a/swh/web/tests/api/views/test_graph.py +++ b/swh/web/tests/api/views/test_graph.py @@ -1,418 +1,428 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import re import textwrap from urllib.parse import unquote, urlparse import pytest from django.http.response import StreamingHttpResponse from swh.model.hashutil import hash_to_bytes from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from swh.web.api.views.graph import API_GRAPH_PERM from swh.web.common.utils import reverse from swh.web.config import SWH_WEB_INTERNAL_SERVER_NAME, get_config from swh.web.tests.utils import check_http_get_response def test_graph_endpoint_no_authentication_for_vpn_users(api_client, requests_mock): graph_query = "stats" url = reverse("api-1-graph", url_args={"graph_query": graph_query}) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json={}, headers={"Content-Type": "application/json"}, ) check_http_get_response( api_client, url, status_code=200, server_name=SWH_WEB_INTERNAL_SERVER_NAME ) def test_graph_endpoint_needs_authentication(api_client): url = reverse("api-1-graph", url_args={"graph_query": "stats"}) check_http_get_response(api_client, url, status_code=401) def _authenticate_graph_user(api_client, keycloak_oidc, is_staff=False): keycloak_oidc.client_permissions = [API_GRAPH_PERM] if is_staff: keycloak_oidc.user_groups = ["/staff"] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") def test_graph_endpoint_needs_permission(api_client, keycloak_oidc, requests_mock): graph_query = "stats" url = reverse("api-1-graph", url_args={"graph_query": graph_query}) oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") check_http_get_response(api_client, url, status_code=403) _authenticate_graph_user(api_client, keycloak_oidc) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json={}, headers={"Content-Type": "application/json"}, ) check_http_get_response(api_client, url, status_code=200) def test_graph_text_plain_response(api_client, keycloak_oidc, requests_mock): _authenticate_graph_user(api_client, keycloak_oidc) graph_query = "leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323" response_text = textwrap.dedent( """\ swh:1:cnt:1d3dace0a825b0535c37c53ed669ef817e9c1b47 
swh:1:cnt:6d5b280f4e33589ae967a7912a587dd5cb8dedaa swh:1:cnt:91bef238bf01356a550d416d14bb464c576ac6f4 swh:1:cnt:58a8b925a463b87d49639fda282b8f836546e396 swh:1:cnt:fd32ee0a87e16ccc853dfbeb7018674f9ce008c0 swh:1:cnt:ab7c39871872589a4fc9e249ebc927fb1042c90d swh:1:cnt:93073c02bf3869845977527de16af4d54765838d swh:1:cnt:4251f795b52c54c447a97c9fe904d8b1f993b1e0 swh:1:cnt:c6e7055424332006d07876ffeba684e7e284b383 swh:1:cnt:8459d8867dc3b15ef7ae9683e21cccc9ab2ec887 swh:1:cnt:5f9981d52202815aa947f85b9dfa191b66f51138 swh:1:cnt:00a685ec51bcdf398c15d588ecdedb611dbbab4b swh:1:cnt:e1cf1ea335106a0197a2f92f7804046425a7d3eb swh:1:cnt:07069b38087f88ec192d2c9aff75a502476fd17d swh:1:cnt:f045ee845c7f14d903a2c035b2691a7c400c01f0 """ ) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=response_text, headers={"Content-Type": "text/plain", "Transfer-Encoding": "chunked"}, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response( api_client, url, status_code=200, content_type="text/plain" ) assert isinstance(resp, StreamingHttpResponse) assert b"".join(resp.streaming_content) == response_text.encode() _response_json = { "counts": {"nodes": 17075708289, "edges": 196236587976}, "ratios": { "compression": 0.16, "bits_per_node": 58.828, "bits_per_edge": 5.119, "avg_locality": 2184278529.729, }, "indegree": {"min": 0, "max": 263180117, "avg": 11.4921492364925}, "outdegree": {"min": 0, "max": 1033207, "avg": 11.4921492364925}, } def test_graph_json_response(api_client, keycloak_oidc, requests_mock): _authenticate_graph_user(api_client, keycloak_oidc) graph_query = "stats" requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json=_response_json, headers={"Content-Type": "application/json"}, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response(api_client, url, status_code=200) assert resp.content_type == "application/json" assert resp.data == _response_json def test_graph_ndjson_response(api_client, keycloak_oidc, requests_mock): _authenticate_graph_user(api_client, keycloak_oidc) graph_query = "visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb" response_ndjson = textwrap.dedent( """\ ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"] """ ) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=response_ndjson, headers={ "Content-Type": "application/x-ndjson", "Transfer-Encoding": "chunked", }, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response(api_client, url, status_code=200) assert isinstance(resp, StreamingHttpResponse) assert resp["Content-Type"] == "application/x-ndjson" assert b"".join(resp.streaming_content) == response_ndjson.encode() def test_graph_response_resolve_origins( archive_data, api_client, keycloak_oidc, requests_mock, origin ): hasher = hashlib.sha1() hasher.update(origin["url"].encode()) origin_sha1 = hasher.digest() origin_swhid = str( ExtendedSWHID(object_type=ExtendedObjectType.ORIGIN, object_id=origin_sha1) ) snapshot = archive_data.snapshot_get_latest(origin["url"])["id"] snapshot_swhid = str( ExtendedSWHID( object_type=ExtendedObjectType.SNAPSHOT, object_id=hash_to_bytes(snapshot) ) ) 
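# Editor's aside: the SWHID computations above rely on the convention that
# an origin is identified by the sha1 of its URL. A minimal standalone
# sketch of that derivation, mirroring the hashlib/ExtendedSWHID usage of
# this test (the URL below is a hypothetical example, not test data):
import hashlib

from swh.model.swhids import ExtendedObjectType, ExtendedSWHID

example_origin_swhid = ExtendedSWHID(
    object_type=ExtendedObjectType.ORIGIN,
    object_id=hashlib.sha1(b"https://example.org/user/repo").digest(),
)
# str(example_origin_swhid) yields "swh:1:ori:" followed by 40 hex digits,
# which is the form the resolve_origins tests expect to be mapped back to
# the origin URL.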
_authenticate_graph_user(api_client, keycloak_oidc) for graph_query, response_text, content_type in ( ( f"visit/nodes/{snapshot_swhid}", f"{snapshot_swhid}\n{origin_swhid}\n", "text/plain", ), ( f"visit/edges/{snapshot_swhid}", f"{snapshot_swhid} {origin_swhid}\n", "text/plain", ), ( f"visit/paths/{snapshot_swhid}", f'["{snapshot_swhid}", "{origin_swhid}"]\n', "application/x-ndjson", ), ): # set two lines response to check resolved origins cache response_text = response_text + response_text requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=response_text, headers={"Content-Type": content_type, "Transfer-Encoding": "chunked"}, ) url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"direction": "backward"}, ) resp = check_http_get_response(api_client, url, status_code=200) assert isinstance(resp, StreamingHttpResponse) assert resp["Content-Type"] == content_type assert b"".join(resp.streaming_content) == response_text.encode() url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"direction": "backward", "resolve_origins": "true"}, ) resp = check_http_get_response(api_client, url, status_code=200) assert isinstance(resp, StreamingHttpResponse) assert resp["Content-Type"] == content_type assert ( b"".join(resp.streaming_content) == response_text.replace(origin_swhid, origin["url"]).encode() ) def test_graph_response_resolve_origins_nothing_to_do( api_client, keycloak_oidc, requests_mock ): _authenticate_graph_user(api_client, keycloak_oidc) graph_query = "stats" requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json=_response_json, headers={"Content-Type": "application/json"}, ) url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"resolve_origins": "true"}, ) resp = check_http_get_response(api_client, url, status_code=200) assert resp.content_type == "application/json" assert resp.data == _response_json def test_graph_response_invalid_accept_header(api_client): url = reverse( "api-1-graph", url_args={"graph_query": "stats"}, query_params={"resolve_origins": "true"}, ) resp = api_client.get(url, HTTP_ACCEPT="text/html") assert resp.status_code == 406 assert resp.content_type == "application/json" assert resp.data["exception"] == "NotAcceptable" assert resp.data["reason"] == "Could not satisfy the request Accept header." 
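# Editor's aside: the streaming tests above pass the graph service's
# text/plain and x-ndjson bodies through unchanged. A hedged client-side
# sketch of consuming such a response incrementally with `requests` (the
# base URL, path layout and token handling are assumptions made for
# illustration, not taken from these tests):
import requests


def iter_graph_lines(base_url: str, graph_query: str, token: str):
    # Stream the body line by line instead of buffering it all, matching
    # the chunked Transfer-Encoding used by the mocked responses above.
    response = requests.get(
        f"{base_url}/api/1/graph/{graph_query}",
        headers={"Authorization": f"Bearer {token}"},
        stream=True,
    )
    response.raise_for_status()
    for line in response.iter_lines(decode_unicode=True):
        if line:
            yield line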
def test_graph_error_response(api_client, keycloak_oidc, requests_mock): _authenticate_graph_user(api_client, keycloak_oidc) graph_query = "foo" error_message = "Not found" content_type = "text/plain" requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=error_message, headers={"Content-Type": content_type}, status_code=404, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response(api_client, url, status_code=404) assert resp.content_type == content_type assert resp.content == f'"{error_message}"'.encode() @pytest.mark.parametrize( "graph_query, query_params, expected_graph_query_params", [ ("stats", {}, ""), ("stats", {"resolve_origins": "true"}, "resolve_origins=true"), ("stats?a=1", {}, "a=1"), ("stats%3Fb=2", {}, "b=2"), ("stats?a=1", {"resolve_origins": "true"}, "a=1&resolve_origins=true"), ("stats%3Fb=2", {"resolve_origins": "true"}, "b=2&resolve_origins=true"), ("stats/?a=1", {"a": "2"}, "a=1&a=2"), ("stats/%3Fa=1", {"a": "2"}, "a=1&a=2"), ], ) def test_graph_query_params( api_client, keycloak_oidc, requests_mock, graph_query, query_params, expected_graph_query_params, ): _authenticate_graph_user(api_client, keycloak_oidc) requests_mock.get( re.compile(get_config()["graph"]["server_url"]), json=_response_json, headers={"Content-Type": "application/json"}, ) url = reverse( - "api-1-graph", url_args={"graph_query": graph_query}, query_params=query_params, + "api-1-graph", + url_args={"graph_query": graph_query}, + query_params=query_params, ) check_http_get_response(api_client, url, status_code=200) url = requests_mock.request_history[0].url parsed_url = urlparse(url) assert parsed_url.path == f"/graph/{unquote(graph_query).split('?')[0]}" assert expected_graph_query_params in parsed_url.query def test_graph_endpoint_max_edges_settings(api_client, keycloak_oidc, requests_mock): graph_config = get_config()["graph"] graph_query = "stats" url = reverse("api-1-graph", url_args={"graph_query": graph_query}) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json={}, headers={"Content-Type": "application/json"}, ) # currently an unauthenticated user can only use the graph endpoint from # the Software Heritage VPN check_http_get_response( api_client, url, status_code=200, server_name=SWH_WEB_INTERNAL_SERVER_NAME ) assert ( f"max_edges={graph_config['max_edges']['anonymous']}" in requests_mock.request_history[0].url ) # standard user _authenticate_graph_user(api_client, keycloak_oidc) check_http_get_response( - api_client, url, status_code=200, + api_client, + url, + status_code=200, ) assert ( f"max_edges={graph_config['max_edges']['user']}" in requests_mock.request_history[1].url ) # staff user _authenticate_graph_user(api_client, keycloak_oidc, is_staff=True) check_http_get_response( - api_client, url, status_code=200, + api_client, + url, + status_code=200, ) assert ( f"max_edges={graph_config['max_edges']['staff']}" in requests_mock.request_history[2].url ) def test_graph_endpoint_max_edges_query_parameter_value( api_client, keycloak_oidc, requests_mock ): graph_config = get_config()["graph"] graph_query = "stats" requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json={}, headers={"Content-Type": "application/json"}, ) _authenticate_graph_user(api_client, keycloak_oidc) max_edges_max_value = graph_config["max_edges"]["user"] max_edges = max_edges_max_value // 2 url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"max_edges": max_edges}, )
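# Editor's aside: the two requests below pin down the clamping rule for the
# max_edges query parameter; restated as a hypothetical helper (not part of
# swh-web) to make the expected behaviour explicit:
def _clamped_max_edges(requested: int, user_cap: int) -> int:
    # a value at or below the authenticated user's cap is forwarded as-is,
    # anything above it falls back to the cap itself
    return min(requested, user_cap)

assert _clamped_max_edges(500, 1000) == 500    # forwarded unchanged
assert _clamped_max_edges(2000, 1000) == 1000  # clamped to the cap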
check_http_get_response( - api_client, url, status_code=200, + api_client, + url, + status_code=200, ) assert f"max_edges={max_edges}" in requests_mock.request_history[0].url max_edges = max_edges_max_value * 2 url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"max_edges": max_edges}, ) check_http_get_response( - api_client, url, status_code=200, + api_client, + url, + status_code=200, ) assert f"max_edges={max_edges_max_value}" in requests_mock.request_history[1].url diff --git a/swh/web/tests/api/views/test_metadata.py b/swh/web/tests/api/views/test_metadata.py index 7d83e664..0ca2e8af 100644 --- a/swh/web/tests/api/views/test_metadata.py +++ b/swh/web/tests/api/views/test_metadata.py @@ -1,208 +1,214 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import attr from hypothesis import given from hypothesis.strategies import composite, sampled_from, sets import pytest from swh.model.hypothesis_strategies import ( raw_extrinsic_metadata as raw_extrinsic_metadata_orig, ) from swh.model.hypothesis_strategies import sha1_git from swh.model.swhids import ExtendedObjectType, ExtendedSWHID, ObjectType from swh.web.common.utils import reverse from swh.web.tests.api.views.utils import scroll_results from swh.web.tests.utils import check_api_get_responses, check_http_get_response # public Web API endpoint for raw extrinsic metadata does not support # extended SWHIDs so we ensure only core ones will be used in test inputs. @composite def raw_extrinsic_metadata(draw): remd = draw(raw_extrinsic_metadata_orig()) remd = attr.evolve( remd, target=ExtendedSWHID( object_type=ExtendedObjectType(draw(sampled_from(ObjectType)).value), object_id=draw(sha1_git()), ), ) return attr.evolve(remd, id=remd.compute_hash()) @given(raw_extrinsic_metadata()) def test_api_raw_extrinsic_metadata(api_client, subtest, metadata): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.metadata_authority_add([metadata.authority]) archive_data.metadata_fetcher_add([metadata.fetcher]) archive_data.raw_extrinsic_metadata_add([metadata]) authority = metadata.authority url = reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": str(metadata.target)}, query_params={"authority": f"{authority.type.value} {authority.url}"}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 expected_result = metadata.to_dict() del expected_result["id"] del expected_result["metadata"] metadata_url = rv.data[0]["metadata_url"] expected_result["metadata_url"] = metadata_url expected_result["discovery_date"] = expected_result[ "discovery_date" ].isoformat() assert rv.data == [expected_result] rv = check_http_get_response(api_client, metadata_url, status_code=200) assert rv["Content-Type"] == "application/octet-stream" assert ( rv["Content-Disposition"] == f'attachment; filename="{metadata.target}_metadata"' ) assert rv.content == metadata.metadata @pytest.mark.parametrize("limit", [1, 2, 10, 100]) @given(sets(raw_extrinsic_metadata(), min_size=1)) def test_api_raw_extrinsic_metadata_scroll(api_client, subtest, limit, meta): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): # Make all metadata objects use the same 
authority and target metadata0 = next(iter(meta)) metadata = { attr.evolve(m, authority=metadata0.authority, target=metadata0.target) for m in meta } # Metadata ids must also be updated as they depend on authority and target metadata = {attr.evolve(m, id=m.compute_hash()) for m in metadata} authority = metadata0.authority archive_data.metadata_authority_add([authority]) archive_data.metadata_fetcher_add(list({m.fetcher for m in metadata})) archive_data.raw_extrinsic_metadata_add(metadata) url = reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": str(metadata0.target)}, query_params={ "authority": f"{authority.type.value} {authority.url}", "limit": limit, }, ) results = scroll_results(api_client, url) expected_results = [m.to_dict() for m in metadata] for expected_result in expected_results: del expected_result["id"] del expected_result["metadata"] expected_result["discovery_date"] = expected_result[ "discovery_date" ].isoformat() assert len(results) == len(expected_results) for result in results: del result["metadata_url"] assert result in expected_results _swhid = "swh:1:dir:a2faa28028657859c16ff506924212b33f0e1307" @pytest.mark.parametrize( "status_code,url_args,query_params", [ pytest.param( 200, {"target": _swhid}, {"authority": "forge http://example.org"}, id="minimal working", ), pytest.param( 200, {"target": _swhid}, { "authority": "forge http://example.org", "after": "2021-06-18T09:31:09", "limit": 100, }, id="maximal working", ), pytest.param( 400, {"target": _swhid}, {"authority": "foo http://example.org"}, id="invalid authority type", ), pytest.param( 400, {"target": _swhid}, - {"authority": "forge http://example.org", "after": "yesterday",}, + { + "authority": "forge http://example.org", + "after": "yesterday", + }, id="invalid 'after' format", ), pytest.param( 400, {"target": _swhid}, - {"authority": "forge http://example.org", "limit": "abc",}, + { + "authority": "forge http://example.org", + "limit": "abc", + }, id="invalid 'limit'", ), ], ) def test_api_raw_extrinsic_metadata_check_params( api_client, archive_data, status_code, url_args, query_params ): url = reverse( "api-1-raw-extrinsic-metadata-swhid", url_args=url_args, query_params=query_params, ) check_api_get_responses(api_client, url, status_code=status_code) @given(raw_extrinsic_metadata()) def test_api_raw_extrinsic_metadata_list_authorities(api_client, subtest, metadata): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.metadata_authority_add([metadata.authority]) archive_data.metadata_fetcher_add([metadata.fetcher]) archive_data.raw_extrinsic_metadata_add([metadata]) authority = metadata.authority url = reverse( "api-1-raw-extrinsic-metadata-swhid-authorities", url_args={"target": str(metadata.target)}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_results = [ { "type": authority.type.value, "url": authority.url, "metadata_list_url": "http://testserver" + reverse( "api-1-raw-extrinsic-metadata-swhid", url_args={"target": str(metadata.target)}, query_params={ "authority": f"{authority.type.value} {authority.url}" }, ), } ] assert rv.data == expected_results diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py index 3a61e43b..aec07e32 100644 --- a/swh/web/tests/api/views/test_origin.py +++ b/swh/web/tests/api/views/test_origin.py @@ -1,762 +1,795 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the 
top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta import json from hypothesis import given import pytest from swh.indexer.storage.model import OriginIntrinsicMetadataRow from swh.model.hashutil import hash_to_bytes from swh.model.model import Origin, OriginVisit, OriginVisitStatus from swh.search.exc import SearchQuerySyntaxError from swh.search.interface import PagedResult from swh.storage.exc import StorageAPIError, StorageDBError from swh.storage.utils import now from swh.web.api.utils import enrich_origin, enrich_origin_visit from swh.web.common.exc import BadInputExc from swh.web.common.origin_visits import get_origin_visits from swh.web.common.utils import reverse from swh.web.tests.api.views.utils import scroll_results from swh.web.tests.data import ( INDEXER_TOOL, ORIGIN_MASTER_REVISION, ORIGIN_METADATA_KEY, ORIGIN_METADATA_VALUE, ) from swh.web.tests.strategies import new_origin, new_snapshots, visit_dates from swh.web.tests.utils import check_api_get_responses def test_api_lookup_origin_visits_raise_error(api_client, mocker): mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits") err_msg = "voluntary error to check the bad request middleware." mock_get_origin_visits.side_effect = BadInputExc(err_msg) url = reverse("api-1-origin-visits", url_args={"origin_url": "http://foo"}) rv = check_api_get_responses(api_client, url, status_code=400) assert rv.data == {"exception": "BadInputExc", "reason": err_msg} def test_api_lookup_origin_visits_raise_swh_storage_error_db(api_client, mocker): mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits") err_msg = "Storage exploded! Will be back online shortly!" mock_get_origin_visits.side_effect = StorageDBError(err_msg) url = reverse("api-1-origin-visits", url_args={"origin_url": "http://foo"}) rv = check_api_get_responses(api_client, url, status_code=503) assert rv.data == { "exception": "StorageDBError", "reason": "An unexpected error occurred in the backend: %s" % err_msg, } def test_api_lookup_origin_visits_raise_swh_storage_error_api(api_client, mocker): mock_get_origin_visits = mocker.patch("swh.web.api.views.origin.get_origin_visits") err_msg = "Storage API dropped dead! Will resurrect asap!" 
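# Editor's aside: taken together, the assertions in these first three tests
# pin down how the API error handler maps backend exceptions to HTTP
# statuses; restated as an illustrative table (the exception names are
# copied from the assertions, the mapping itself is what the tests verify):
EXPECTED_ERROR_STATUS = {
    "BadInputExc": 400,     # malformed client input
    "StorageDBError": 503,  # storage database backend failure
    "StorageAPIError": 503, # storage API backend failure
}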
mock_get_origin_visits.side_effect = StorageAPIError(err_msg) url = reverse("api-1-origin-visits", url_args={"origin_url": "http://foo"}) rv = check_api_get_responses(api_client, url, status_code=503) assert rv.data == { "exception": "StorageAPIError", "reason": "An unexpected error occurred in the api backend: %s" % err_msg, } @given(new_origin(), visit_dates(3), new_snapshots(3)) def test_api_lookup_origin_visits( api_client, subtest, new_origin, visit_dates, new_snapshots ): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.origin_add([new_origin]) for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=origin_visit.visit, date=now(), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) all_visits = list(reversed(get_origin_visits(new_origin.to_dict()))) for last_visit, expected_visits in ( (None, all_visits[:2]), (all_visits[1]["visit"], all_visits[2:]), ): url = reverse( "api-1-origin-visits", url_args={"origin_url": new_origin.url}, query_params={"per_page": 2, "last_visit": last_visit}, ) rv = check_api_get_responses(api_client, url, status_code=200) for i in range(len(expected_visits)): expected_visits[i] = enrich_origin_visit( expected_visits[i], with_origin_link=False, with_origin_visit_link=True, request=rv.wsgi_request, ) assert rv.data == expected_visits @given(new_origin(), visit_dates(3), new_snapshots(3)) def test_api_lookup_origin_visits_by_id( api_client, subtest, new_origin, visit_dates, new_snapshots ): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.origin_add([new_origin]) for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=origin_visit.visit, date=now(), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) all_visits = list(reversed(get_origin_visits(new_origin.to_dict()))) for last_visit, expected_visits in ( (None, all_visits[:2]), (all_visits[1]["visit"], all_visits[2:4]), ): url = reverse( "api-1-origin-visits", url_args={"origin_url": new_origin.url}, query_params={"per_page": 2, "last_visit": last_visit}, ) rv = check_api_get_responses(api_client, url, status_code=200) for i in range(len(expected_visits)): expected_visits[i] = enrich_origin_visit( expected_visits[i], with_origin_link=False, with_origin_visit_link=True, request=rv.wsgi_request, ) assert rv.data == expected_visits @given(new_origin(), visit_dates(3), new_snapshots(3)) def test_api_lookup_origin_visit( api_client, subtest, new_origin, visit_dates, new_snapshots ): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.origin_add([new_origin]) for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( - 
[OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] visit_id = origin_visit.visit archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=origin_visit.visit, date=visit_date + timedelta(minutes=5), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "api-1-origin-visit", url_args={"origin_url": new_origin.url, "visit_id": visit_id}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_visit = archive_data.origin_visit_get_by(new_origin.url, visit_id) expected_visit = enrich_origin_visit( expected_visit, with_origin_link=True, with_origin_visit_link=False, request=rv.wsgi_request, ) assert rv.data == expected_visit @given(new_origin()) def test_api_lookup_origin_visit_latest_no_visit(api_client, archive_data, new_origin): archive_data.origin_add([new_origin]) url = reverse("api-1-origin-visit-latest", url_args={"origin_url": new_origin.url}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "No visit for origin %s found" % new_origin.url, } @given(new_origin(), visit_dates(2), new_snapshots(1)) def test_api_lookup_origin_visit_latest( api_client, subtest, new_origin, visit_dates, new_snapshots ): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.origin_add([new_origin]) visit_dates.sort() visit_ids = [] for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] visit_ids.append(origin_visit.visit) archive_data.snapshot_add([new_snapshots[0]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit_ids[0], date=now(), status="full", snapshot=new_snapshots[0].id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "api-1-origin-visit-latest", url_args={"origin_url": new_origin.url} ) rv = check_api_get_responses(api_client, url, status_code=200) expected_visit = archive_data.origin_visit_status_get_latest( new_origin.url, type="git" ) expected_visit = enrich_origin_visit( expected_visit, with_origin_link=True, with_origin_visit_link=False, request=rv.wsgi_request, ) assert rv.data == expected_visit @given(new_origin(), visit_dates(2), new_snapshots(1)) def test_api_lookup_origin_visit_latest_with_snapshot( api_client, subtest, new_origin, visit_dates, new_snapshots ): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.origin_add([new_origin]) visit_dates.sort() visit_ids = [] for i, visit_date in enumerate(visit_dates): origin_visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] visit_ids.append(origin_visit.visit) archive_data.snapshot_add([new_snapshots[0]]) # Add snapshot to the latest visit visit_id = visit_ids[-1] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit_id, date=now(), status="full", snapshot=new_snapshots[0].id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "api-1-origin-visit-latest", 
url_args={"origin_url": new_origin.url}, query_params={"require_snapshot": True}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_visit = archive_data.origin_visit_status_get_latest( new_origin.url, type="git", require_snapshot=True ) expected_visit = enrich_origin_visit( expected_visit, with_origin_link=True, with_origin_visit_link=False, request=rv.wsgi_request, ) assert rv.data == expected_visit def test_api_lookup_origin_visit_not_found(api_client, origin): all_visits = list(reversed(get_origin_visits(origin))) max_visit_id = max([v["visit"] for v in all_visits]) url = reverse( "api-1-origin-visit", url_args={"origin_url": origin["url"], "visit_id": max_visit_id + 1}, ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Origin %s or its visit with id %s not found!" % (origin["url"], max_visit_id + 1), } def test_api_origins_wrong_input(api_client, archive_data): - """Should fail with 400 if the input is deprecated. - - """ + """Should fail with 400 if the input is deprecated.""" # fail if wrong input url = reverse("api-1-origins", query_params={"origin_from": 1}) rv = check_api_get_responses(api_client, url, status_code=400) assert rv.data == { "exception": "BadInputExc", "reason": "Please use the Link header to browse through result", } def test_api_origins(api_client, archive_data): page_result = archive_data.origin_list(limit=10000) origins = page_result.results origin_urls = {origin.url for origin in origins} # Get only one url = reverse("api-1-origins", query_params={"origin_count": 1}) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} <= origin_urls # Get all url = reverse("api-1-origins", query_params={"origin_count": len(origins)}) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(origins) assert {origin["url"] for origin in rv.data} == origin_urls # Get "all + 10" url = reverse("api-1-origins", query_params={"origin_count": len(origins) + 10}) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(origins) assert {origin["url"] for origin in rv.data} == origin_urls @pytest.mark.parametrize("origin_count", [1, 2, 10, 100]) def test_api_origins_scroll(api_client, archive_data, origin_count): page_result = archive_data.origin_list(limit=10000) origins = page_result.results origin_urls = {origin.url for origin in origins} url = reverse("api-1-origins", query_params={"origin_count": origin_count}) results = scroll_results(api_client, url) assert len(results) == len(origins) assert {origin["url"] for origin in results} == origin_urls def test_api_origin_by_url(api_client, archive_data, origin): origin_url = origin["url"] url = reverse("api-1-origin", url_args={"origin_url": origin_url}) rv = check_api_get_responses(api_client, url, status_code=200) expected_origin = archive_data.origin_get([origin_url])[0] expected_origin = enrich_origin(expected_origin, rv.wsgi_request) assert rv.data == expected_origin @given(new_origin()) def test_api_origin_not_found(api_client, new_origin): url = reverse("api-1-origin", url_args={"origin_url": new_origin.url}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Origin with url %s not found!" 
% new_origin.url, } @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search(api_client, mocker, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } # Search for 'github.com', get only one url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": 1}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} <= expected_origins assert rv.data == [ enrich_origin({"url": origin["url"]}, request=rv.wsgi_request) for origin in rv.data ] # Search for 'github.com', get all url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins assert rv.data == [ enrich_origin({"url": origin["url"]}, request=rv.wsgi_request) for origin in rv.data ] # Search for 'github.com', get more than available url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": 10}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins assert rv.data == [ enrich_origin({"url": origin["url"]}, request=rv.wsgi_request) for origin in rv.data ] @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search_words(api_client, mocker, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } url = reverse( "api-1-origin-search", url_args={"url_pattern": "github com"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins url = reverse( "api-1-origin-search", url_args={"url_pattern": "com github"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins url = reverse( "api-1-origin-search", url_args={"url_pattern": "memononen libtess2"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} == { "https://github.com/memononen/libtess2" } url = reverse( "api-1-origin-search", url_args={"url_pattern": "libtess2 memononen"}, query_params={"limit": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1 assert {origin["url"] for origin in rv.data} == { "https://github.com/memononen/libtess2" } @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search_visit_type(api_client, mocker, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } url = reverse( "api-1-origin-search", - url_args={"url_pattern": "github com",}, + url_args={ + "url_pattern": "github com", + }, 
query_params={"visit_type": "git"}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins url = reverse( "api-1-origin-search", - url_args={"url_pattern": "github com",}, + url_args={ + "url_pattern": "github com", + }, query_params={"visit_type": "foo"}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert rv.data == [] def test_api_origin_search_use_ql(api_client, mocker): expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } ORIGINS = [{"url": origin} for origin in expected_origins] mock_archive_search = mocker.patch("swh.web.common.archive.search") mock_archive_search.origin_search.return_value = PagedResult( - results=ORIGINS, next_page_token=None, + results=ORIGINS, + next_page_token=None, ) query = "origin : 'github.com'" url = reverse( "api-1-origin-search", url_args={"url_pattern": query}, query_params={"visit_type": "git", "use_ql": "true"}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert {origin["url"] for origin in rv.data} == expected_origins mock_archive_search.origin_search.assert_called_with( query=query, page_token=None, with_visit=False, visit_types=["git"], limit=70 ) def test_api_origin_search_ql_syntax_error(api_client, mocker): mock_archive_search = mocker.patch("swh.web.common.archive.search") mock_archive_search.origin_search.side_effect = SearchQuerySyntaxError( "Invalid syntax" ) query = "this is not a valid query" url = reverse( "api-1-origin-search", url_args={"url_pattern": query}, query_params={"visit_type": "git", "use_ql": "true"}, ) rv = check_api_get_responses(api_client, url, status_code=400) assert rv.data == { "exception": "BadInputExc", "reason": "Syntax error in search query: Invalid syntax", } mock_archive_search.origin_search.assert_called_with( query=query, page_token=None, with_visit=False, visit_types=["git"], limit=70 ) @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) @pytest.mark.parametrize("limit", [1, 2, 3, 10]) def test_api_origin_search_scroll(api_client, archive_data, mocker, limit, backend): if backend != "swh-search": # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) expected_origins = { "https://github.com/wcoder/highlightjs-line-numbers.js", "https://github.com/memononen/libtess2", } url = reverse( "api-1-origin-search", url_args={"url_pattern": "github.com"}, query_params={"limit": limit}, ) results = scroll_results(api_client, url) assert {origin["url"] for origin in results} == expected_origins @pytest.mark.parametrize("backend", ["swh-search", "swh-storage"]) def test_api_origin_search_limit(api_client, archive_data, tests_data, mocker, backend): if backend == "swh-search": tests_data["search"].origin_update( [{"url": "http://foobar/{}".format(i)} for i in range(2000)] ) else: # equivalent to not configuring search in the config mocker.patch("swh.web.common.archive.search", None) archive_data.origin_add( [Origin(url="http://foobar/{}".format(i)) for i in range(2000)] ) url = reverse( "api-1-origin-search", url_args={"url_pattern": "foobar"}, query_params={"limit": 1050}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == 1000 @pytest.mark.parametrize("backend", ["swh-search", "swh-indexer-storage"]) def test_api_origin_metadata_search(api_client, mocker, backend): mock_config = mocker.patch("swh.web.common.archive.config") 
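# Patching the archive config selects which backend answers the metadata
# fulltext search; the parametrization above exercises both swh-search
# and swh-indexer-storage against the same expected results.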
mock_config.get_config.return_value = { "search_config": {"metadata_backend": backend} } url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE} ) rv = check_api_get_responses(api_client, url, status_code=200) rv.data = sorted(rv.data, key=lambda d: d["url"]) expected_data = sorted( [ { "url": origin_url, "metadata": { "from_revision": ORIGIN_MASTER_REVISION[origin_url], "tool": { "name": INDEXER_TOOL["tool_name"], "version": INDEXER_TOOL["tool_version"], "configuration": INDEXER_TOOL["tool_configuration"], "id": INDEXER_TOOL["id"], }, "mappings": [], }, } for origin_url in sorted(ORIGIN_MASTER_REVISION.keys()) ], key=lambda d: d["url"], ) for i in range(len(expected_data)): expected = expected_data[i] response = rv.data[i] metadata = response["metadata"].pop("metadata") assert any( [ORIGIN_METADATA_VALUE in json.dumps(val) for val in metadata.values()] ) assert response == expected def test_api_origin_metadata_search_limit(api_client, mocker): mock_idx_storage = mocker.patch("swh.web.common.archive.idx_storage") oimsft = mock_idx_storage.origin_intrinsic_metadata_search_fulltext oimsft.side_effect = lambda conjunction, limit: [ OriginIntrinsicMetadataRow( id=origin_url, from_revision=hash_to_bytes(master_rev), indexer_configuration_id=INDEXER_TOOL["id"], metadata={ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE}, mappings=[], ) for origin_url, master_rev in ORIGIN_MASTER_REVISION.items() ] url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE} ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(ORIGIN_MASTER_REVISION) oimsft.assert_called_with(conjunction=[ORIGIN_METADATA_VALUE], limit=70) url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE, "limit": 10}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(ORIGIN_MASTER_REVISION) oimsft.assert_called_with(conjunction=[ORIGIN_METADATA_VALUE], limit=10) url = reverse( "api-1-origin-metadata-search", query_params={"fulltext": ORIGIN_METADATA_VALUE, "limit": 987}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert len(rv.data) == len(ORIGIN_MASTER_REVISION) oimsft.assert_called_with(conjunction=[ORIGIN_METADATA_VALUE], limit=100) def test_api_origin_intrinsic_metadata(api_client, origin): url = reverse( "api-origin-intrinsic-metadata", url_args={"origin_url": origin["url"]} ) rv = check_api_get_responses(api_client, url, status_code=200) assert ORIGIN_METADATA_KEY in rv.data assert rv.data[ORIGIN_METADATA_KEY] == ORIGIN_METADATA_VALUE def test_api_origin_metadata_search_invalid(api_client, mocker): mock_idx_storage = mocker.patch("swh.web.common.archive.idx_storage") url = reverse("api-1-origin-metadata-search") check_api_get_responses(api_client, url, status_code=400) mock_idx_storage.assert_not_called() @pytest.mark.parametrize("backend", ["swh-counters", "swh-storage"]) def test_api_stat_counters(api_client, mocker, backend): mock_config = mocker.patch("swh.web.common.archive.config") mock_config.get_config.return_value = {"counters_backend": backend} url = reverse("api-1-stat-counters") rv = check_api_get_responses(api_client, url, status_code=200) counts = json.loads(rv.content) for obj in ["content", "origin", "release", "directory", "revision"]: assert counts.get(obj, 0) > 0 diff --git a/swh/web/tests/api/views/test_origin_save.py b/swh/web/tests/api/views/test_origin_save.py index 9cdfe6e7..29359585 100644 --- 
a/swh/web/tests/api/views/test_origin_save.py +++ b/swh/web/tests/api/views/test_origin_save.py @@ -1,578 +1,604 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta import uuid import pytest from django.core.exceptions import ObjectDoesNotExist from django.utils import timezone from swh.web.auth.utils import API_SAVE_ORIGIN_PERMISSION, SWH_AMBASSADOR_PERMISSION from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_FAILED, VISIT_STATUS_FULL, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, ) from swh.web.common.typing import OriginExistenceCheckInfo from swh.web.common.utils import reverse from swh.web.settings.tests import save_origin_rate_post from swh.web.tests.utils import ( check_api_get_responses, check_api_post_response, check_api_post_responses, create_django_permission, ) pytestmark = pytest.mark.django_db @pytest.fixture(autouse=True) def populated_db(): SaveAuthorizedOrigin.objects.create(url="https://github.com/"), SaveAuthorizedOrigin.objects.create(url="https://gitlab.com/"), SaveUnauthorizedOrigin.objects.create(url="https://github.com/user/illegal_repo") SaveUnauthorizedOrigin.objects.create(url="https://gitlab.com/user_to_exclude") def test_invalid_visit_type(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={ "visit_type": "foo", "origin_url": "https://github.com/torvalds/linux", }, ) check_api_get_responses(api_client, url, status_code=400) def test_invalid_origin_url(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": "bar"} ) check_api_get_responses(api_client, url, status_code=400) def check_created_save_request_status( api_client, mocker, origin_url, expected_request_status, expected_task_status=None, visit_date=None, ): mock_origin_exists = mocker.patch("swh.web.common.origin_save.origin_exists") mock_origin_exists.return_value = OriginExistenceCheckInfo( origin_url=origin_url, exists=True, last_modified=None, content_length=None ) url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url} ) mock_visit_date = mocker.patch( ("swh.web.common.origin_save._get_visit_info_for_save_request") ) mock_visit_date.return_value = (visit_date, None) if expected_request_status != SAVE_REQUEST_REJECTED: response = check_api_post_responses(api_client, url, data=None, status_code=200) assert response.data["save_request_status"] == expected_request_status assert response.data["save_task_status"] == expected_task_status else: check_api_post_responses(api_client, url, data=None, status_code=403) def check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status, expected_task_status, scheduler_task_status="next_run_not_scheduled", scheduler_task_run_status=None, visit_date=None, visit_status=None, ): if expected_task_status != SAVE_TASK_NOT_CREATED: task = dict(swh_scheduler.search_tasks()[0].items()) backend_id = str(uuid.uuid4()) if scheduler_task_status != "next_run_not_scheduled": swh_scheduler.schedule_task_run(task["id"], backend_id) if scheduler_task_run_status is not None: 
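# Drive the scheduler task run through its lifecycle: start it, then end
# it with the requested status (e.g. "eventful" or "failed"), so that the
# save request status computed by the API reflects a finished run.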
swh_scheduler.start_task_run(backend_id) task_run = dict( swh_scheduler.end_task_run(backend_id, scheduler_task_run_status).items() ) url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url} ) mock_visit_date = mocker.patch( ("swh.web.common.origin_save._get_visit_info_for_save_request") ) mock_visit_date.return_value = (visit_date, visit_status) response = check_api_get_responses(api_client, url, status_code=200) save_request_data = response.data[0] assert save_request_data["save_request_status"] == expected_request_status assert save_request_data["save_task_status"] == expected_task_status assert save_request_data["visit_status"] == visit_status if scheduler_task_run_status is not None: # Check that save task status is still available when # the scheduler task has been archived swh_scheduler.delete_archived_tasks( [{"task_id": task["id"], "task_run_id": task_run["id"]}] ) response = check_api_get_responses(api_client, url, status_code=200) save_request_data = response.data[0] assert save_request_data["save_task_status"] == expected_task_status assert save_request_data["visit_status"] == visit_status def test_save_request_rejected(api_client, mocker, swh_scheduler): origin_url = "https://github.com/user/illegal_repo" check_created_save_request_status( - api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_REJECTED, + api_client, + mocker, + origin_url, + expected_request_status=SAVE_REQUEST_REJECTED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_REJECTED, expected_task_status=SAVE_TASK_NOT_CREATED, ) def test_save_request_pending(api_client, mocker, swh_scheduler): origin_url = "https://unkwownforge.com/user/repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_PENDING, expected_task_status=SAVE_TASK_NOT_CREATED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_PENDING, expected_task_status=SAVE_TASK_NOT_CREATED, ) def test_save_request_scheduled(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, scheduler_task_status="next_run_scheduled", scheduler_task_run_status="scheduled", ) def test_save_request_completed(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SUCCEEDED, scheduler_task_status="completed", scheduler_task_run_status="eventful", visit_date=None, ) def test_save_request_completed_visit_status(api_client, mocker, swh_scheduler): origin_url = "https://github.com/Kitware/CMake" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) visit_date = datetime.now(tz=timezone.utc) + timedelta(hours=1) check_save_request_status( api_client, 
mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SUCCEEDED, scheduler_task_status="completed", scheduler_task_run_status="eventful", visit_date=visit_date, visit_status=VISIT_STATUS_FULL, ) def test_save_request_failed(api_client, mocker, swh_scheduler): origin_url = "https://gitlab.com/inkscape/inkscape" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_FAILED, scheduler_task_status="disabled", scheduler_task_run_status="failed", visit_status=VISIT_STATUS_FAILED, ) def test_create_save_request_no_duplicate(api_client, mocker, swh_scheduler): origin_url = "https://github.com/webpack/webpack" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 1 check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, scheduler_task_status="next_run_scheduled", scheduler_task_run_status="scheduled", ) check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_SCHEDULED, ) sors = list( SaveOriginRequest.objects.filter(visit_type="git", origin_url=origin_url) ) assert len(sors) == 1 def test_get_save_requests_unknown_origin(api_client, swh_scheduler): unknown_origin_url = "https://gitlab.com/foo/bar" url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": unknown_origin_url}, ) response = check_api_get_responses(api_client, url, status_code=404) assert response.data == { "exception": "NotFoundExc", "reason": ( "No save requests found for visit of type git on origin with url %s." 
) % unknown_origin_url, } _visit_type = "git" _origin_url = "https://github.com/python/cpython" def test_save_requests_rate_limit(api_client, swh_scheduler): url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) for _ in range(save_origin_rate_post): check_api_post_response(api_client, url, status_code=200) check_api_post_response(api_client, url, status_code=429) def test_save_requests_no_rate_limit_if_permission( api_client, regular_user, swh_scheduler ): regular_user.user_permissions.add( create_django_permission(API_SAVE_ORIGIN_PERMISSION) ) assert regular_user.has_perm(API_SAVE_ORIGIN_PERMISSION) api_client.force_login(regular_user) url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) for _ in range(save_origin_rate_post): check_api_post_response(api_client, url, status_code=200) check_api_post_response(api_client, url, status_code=200) def test_save_request_unknown_repo_with_permission( api_client, regular_user, mocker, swh_scheduler ): regular_user.user_permissions.add( create_django_permission(API_SAVE_ORIGIN_PERMISSION) ) assert regular_user.has_perm(API_SAVE_ORIGIN_PERMISSION) api_client.force_login(regular_user) origin_url = "https://unkwownforge.org/user/repo" check_created_save_request_status( api_client, mocker, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) check_save_request_status( api_client, mocker, swh_scheduler, origin_url, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) def test_save_request_form_server_error(api_client, mocker): create_save_origin_request = mocker.patch( "swh.web.api.views.origin_save.create_save_origin_request" ) create_save_origin_request.side_effect = Exception("Server error") url = reverse( "api-1-save-origin", url_args={"visit_type": _visit_type, "origin_url": _origin_url}, ) check_api_post_responses(api_client, url, status_code=500) @pytest.fixture def origin_to_review(): return "https://git.example.org/user/project" def test_create_save_request_pending_review_anonymous_user( api_client, origin_to_review, swh_scheduler ): url = reverse( "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_to_review}, ) response = check_api_post_responses(api_client, url, status_code=200) assert response.data["save_request_status"] == SAVE_REQUEST_PENDING with pytest.raises(ObjectDoesNotExist): SaveAuthorizedOrigin.objects.get(url=origin_to_review) def test_create_save_request_archives_with_ambassador_user( - api_client, keycloak_oidc, requests_mock, swh_scheduler, + api_client, + keycloak_oidc, + requests_mock, + swh_scheduler, ): swh_scheduler.add_load_archive_task_type() keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") originUrl = "https://somewhere.org/simple" artifact_version = "1.2.3" artifact_filename = f"tarball-{artifact_version}.tar.gz" artifact_url = f"{originUrl}/{artifact_filename}" content_length = "100" last_modified = "Sun, 21 Aug 2011 16:26:32 GMT" requests_mock.head( artifact_url, status_code=200, - headers={"content-length": content_length, "last-modified": last_modified,}, + headers={ + "content-length": content_length, + "last-modified": last_modified, + }, ) url = reverse( "api-1-save-origin", - url_args={"visit_type": "archives", "origin_url": originUrl,}, + url_args={ + 
"visit_type": "archives", + "origin_url": originUrl, + }, ) response = check_api_post_response( api_client, url, status_code=200, data={ "archives_data": [ - {"artifact_url": artifact_url, "artifact_version": artifact_version,} + { + "artifact_url": artifact_url, + "artifact_version": artifact_version, + } ] }, ) assert response.data["save_request_status"] == SAVE_REQUEST_ACCEPTED assert SaveAuthorizedOrigin.objects.get(url=originUrl) def test_create_save_request_archives_missing_artifacts_data( api_client, keycloak_oidc, swh_scheduler ): swh_scheduler.add_load_archive_task_type() keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") originUrl = "https://somewhere.org/simple" url = reverse( "api-1-save-origin", - url_args={"visit_type": "archives", "origin_url": originUrl,}, + url_args={ + "visit_type": "archives", + "origin_url": originUrl, + }, ) - response = check_api_post_response(api_client, url, status_code=400, data={},) + response = check_api_post_response( + api_client, + url, + status_code=400, + data={}, + ) assert "Artifacts data are missing" in response.data["reason"] response = check_api_post_response( api_client, url, status_code=400, data={"archives_data": [{"artifact_url": "", "arttifact_version": "1.0"}]}, ) assert "Missing url or version for an artifact to load" in response.data["reason"] def test_create_save_request_archives_accepted_ambassador_user( api_client, origin_to_review, keycloak_oidc, mocker, swh_scheduler ): keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION] oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") check_created_save_request_status( api_client, mocker, origin_to_review, expected_request_status=SAVE_REQUEST_ACCEPTED, expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED, ) assert SaveAuthorizedOrigin.objects.get(url=origin_to_review) def test_create_save_request_anonymous_user_no_user_id(api_client, swh_scheduler): origin_url = "https://some.git.hosters/user/repo" url = reverse( - "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, + "api-1-save-origin", + url_args={"visit_type": "git", "origin_url": origin_url}, ) check_api_post_responses(api_client, url, status_code=200) sor = SaveOriginRequest.objects.get(origin_url=origin_url) assert sor.user_ids is None def test_create_save_request_authenticated_user_id( api_client, keycloak_oidc, swh_scheduler ): oidc_profile = keycloak_oidc.login() api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") origin_url = "https://some.git.hosters/user/repo2" url = reverse( - "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, + "api-1-save-origin", + url_args={"visit_type": "git", "origin_url": origin_url}, ) response = check_api_post_response(api_client, url, status_code=200) assert response.wsgi_request.user.id is not None user_id = str(response.wsgi_request.user.id) sor = SaveOriginRequest.objects.get(user_ids=f'"{user_id}"') assert sor.user_ids == f'"{user_id}"' def test_create_pending_save_request_multiple_authenticated_users( api_client, swh_scheduler, regular_user, regular_user2 ): origin_url = "https://some.git.hosters/user/repo3" url = reverse( - "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}, + "api-1-save-origin", + url_args={"visit_type": "git", "origin_url": origin_url}, ) 
api_client.force_login(regular_user) check_api_post_response(api_client, url, status_code=200) api_client.force_login(regular_user2) check_api_post_response(api_client, url, status_code=200) assert SaveOriginRequest.objects.get(user_ids__contains=f'"{regular_user.id}"') assert SaveOriginRequest.objects.get(user_ids__contains=f'"{regular_user2.id}"') diff --git a/swh/web/tests/api/views/test_revision.py b/swh/web/tests/api/views/test_revision.py index 7abb388c..47717683 100644 --- a/swh/web/tests/api/views/test_revision.py +++ b/swh/web/tests/api/views/test_revision.py @@ -1,228 +1,230 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from hypothesis import given from swh.model.from_disk import DentryPerms from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( Directory, DirectoryEntry, Revision, RevisionType, TimestampWithTimezone, ) from swh.web.api.utils import enrich_content, enrich_directory_entry, enrich_revision from swh.web.common.utils import reverse from swh.web.tests.data import random_sha1 from swh.web.tests.strategies import new_person, new_swh_date from swh.web.tests.utils import check_api_get_responses, check_http_get_response def test_api_revision(api_client, archive_data, revision): url = reverse("api-1-revision", url_args={"sha1_git": revision}) rv = check_api_get_responses(api_client, url, status_code=200) expected_revision = archive_data.revision_get(revision) enrich_revision(expected_revision, rv.wsgi_request) assert rv.data == expected_revision def test_api_revision_not_found(api_client): unknown_revision_ = random_sha1() url = reverse("api-1-revision", url_args={"sha1_git": unknown_revision_}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Revision with sha1_git %s not found." % unknown_revision_, } def test_api_revision_raw_ok(api_client, archive_data, revision): url = reverse("api-1-revision-raw-message", url_args={"sha1_git": revision}) expected_message = archive_data.revision_get(revision)["message"] rv = check_http_get_response(api_client, url, status_code=200) assert rv["Content-Type"] == "application/octet-stream" assert rv.content == expected_message.encode() def test_api_revision_raw_ko_no_rev(api_client): unknown_revision_ = random_sha1() url = reverse( "api-1-revision-raw-message", url_args={"sha1_git": unknown_revision_} ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Revision with sha1_git %s not found." % unknown_revision_, } def test_api_revision_log(api_client, archive_data, revision): limit = 10 url = reverse( "api-1-revision-log", url_args={"sha1_git": revision}, query_params={"limit": limit}, ) rv = check_api_get_responses(api_client, url, status_code=200) expected_log = archive_data.revision_log(revision, limit=limit) expected_log = list( map(enrich_revision, expected_log, [rv.wsgi_request] * len(expected_log)) ) assert rv.data == expected_log def test_api_revision_log_not_found(api_client): unknown_revision_ = random_sha1() url = reverse("api-1-revision-log", url_args={"sha1_git": unknown_revision_}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Revision with sha1_git %s not found." 
% unknown_revision_, } assert not rv.has_header("Link") def test_api_revision_directory_ko_not_found(api_client): sha1_git = random_sha1() url = reverse("api-1-revision-directory", {"sha1_git": sha1_git}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": f"Revision with sha1_git {sha1_git} not found.", } def test_api_revision_directory_ok_returns_dir_entries( api_client, archive_data, revision ): url = reverse("api-1-revision-directory", {"sha1_git": revision}) rv = check_api_get_responses(api_client, url, status_code=200) rev_data = archive_data.revision_get(revision) dir_content = archive_data.directory_ls(rev_data["directory"]) dir_content = [ enrich_directory_entry(dir_entry, request=rv.wsgi_request) for dir_entry in dir_content ] assert rv.data == { "content": dir_content, "path": ".", "type": "dir", "revision": revision, } @given(new_person(), new_swh_date()) def test_api_revision_directory_ok_returns_content( api_client, archive_data, content, person, date ): content_path = "foo" _dir = Directory( entries=( DirectoryEntry( name=content_path.encode(), type="file", target=hash_to_bytes(content["sha1_git"]), perms=DentryPerms.content, ), ) ) archive_data.directory_add([_dir]) revision = Revision( directory=_dir.id, author=person, committer=person, message=b"commit message", date=TimestampWithTimezone.from_datetime(date), committer_date=TimestampWithTimezone.from_datetime(date), synthetic=False, type=RevisionType.GIT, ) archive_data.revision_add([revision]) revision_id = hash_to_hex(revision.id) cnt_data = archive_data.content_get(content["sha1"]) url = reverse( - "api-1-revision-directory", {"sha1_git": revision_id, "dir_path": content_path}, + "api-1-revision-directory", + {"sha1_git": revision_id, "dir_path": content_path}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert rv.data == { "content": enrich_content(cnt_data, request=rv.wsgi_request), "path": content_path, "type": "file", "revision": revision_id, } @given(new_person(), new_swh_date()) def test_api_revision_directory_ok_returns_revision( api_client, archive_data, revision, person, date ): rev_path = "foo" _dir = Directory( entries=( DirectoryEntry( name=rev_path.encode(), type="rev", target=hash_to_bytes(revision), perms=DentryPerms.revision, ), ) ) archive_data.directory_add([_dir]) rev = Revision( directory=_dir.id, author=person, committer=person, message=b"commit message", date=TimestampWithTimezone.from_datetime(date), committer_date=TimestampWithTimezone.from_datetime(date), synthetic=False, type=RevisionType.GIT, ) archive_data.revision_add([rev]) revision_id = hash_to_hex(rev.id) rev_data = archive_data.revision_get(revision) url = reverse( - "api-1-revision-directory", {"sha1_git": revision_id, "dir_path": rev_path}, + "api-1-revision-directory", + {"sha1_git": revision_id, "dir_path": rev_path}, ) rv = check_api_get_responses(api_client, url, status_code=200) assert rv.data == { "content": enrich_revision(rev_data, request=rv.wsgi_request), "path": rev_path, "type": "rev", "revision": revision_id, } def test_api_revision_uppercase(api_client, revision): url = reverse( "api-1-revision-uppercase-checksum", url_args={"sha1_git": revision.upper()} ) resp = check_http_get_response(api_client, url, status_code=302) redirect_url = reverse("api-1-revision", url_args={"sha1_git": revision}) assert resp["location"] == redirect_url diff --git a/swh/web/tests/api/views/test_vault.py b/swh/web/tests/api/views/test_vault.py index 
5945ce80..fbf058a7 100644 --- a/swh/web/tests/api/views/test_vault.py +++ b/swh/web/tests/api/views/test_vault.py @@ -1,325 +1,330 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import re import pytest from swh.model.swhids import CoreSWHID from swh.vault.exc import NotFoundExc from swh.web.common.utils import reverse from swh.web.tests.utils import ( check_api_get_responses, check_api_post_responses, check_http_get_response, check_http_post_response, ) ##################### # Current API: def test_api_vault_cook(api_client, mocker, directory, revision): mock_archive = mocker.patch("swh.web.api.views.vault.archive") for bundle_type, swhid, content_type, in ( ("flat", f"swh:1:dir:{directory}", "application/gzip"), ("gitfast", f"swh:1:rev:{revision}", "application/gzip"), ("git_bare", f"swh:1:rev:{revision}", "application/x-tar"), ): swhid = CoreSWHID.from_string(swhid) fetch_url = reverse( f"api-1-vault-fetch-{bundle_type.replace('_', '-')}", url_args={"swhid": str(swhid)}, ) stub_cook = { "type": bundle_type, "progress_msg": None, "task_id": 1, "task_status": "done", "swhid": swhid, } stub_fetch = b"content" mock_archive.vault_cook.return_value = stub_cook mock_archive.vault_fetch.return_value = stub_fetch email = "test@test.mail" url = reverse( f"api-1-vault-cook-{bundle_type.replace('_', '-')}", url_args={"swhid": str(swhid)}, query_params={"email": email}, ) rv = check_api_post_responses(api_client, url, data=None, status_code=200) assert rv.data == { "fetch_url": rv.wsgi_request.build_absolute_uri(fetch_url), "progress_message": None, "id": 1, "status": "done", "swhid": str(swhid), } mock_archive.vault_cook.assert_called_with(bundle_type, swhid, email) rv = check_http_get_response(api_client, fetch_url, status_code=200) assert rv["Content-Type"] == content_type assert rv.content == stub_fetch mock_archive.vault_fetch.assert_called_with(bundle_type, swhid) def test_api_vault_cook_notfound( api_client, mocker, directory, revision, unknown_directory, unknown_revision ): mock_vault = mocker.patch("swh.web.common.archive.vault") mock_vault.cook.side_effect = NotFoundExc("object not found") mock_vault.fetch.side_effect = NotFoundExc("cooked archive not found") mock_vault.progress.side_effect = NotFoundExc("cooking request not found") for bundle_type, swhid in ( ("flat", f"swh:1:dir:{directory}"), ("gitfast", f"swh:1:rev:{revision}"), ("git_bare", f"swh:1:rev:{revision}"), ): swhid = CoreSWHID.from_string(swhid) url = reverse( f"api-1-vault-cook-{bundle_type.replace('_', '-')}", url_args={"swhid": str(swhid)}, ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data["exception"] == "NotFoundExc" assert rv.data["reason"] == f"Cooking of {swhid} was never requested." mock_vault.progress.assert_called_with(bundle_type, swhid) for bundle_type, swhid in ( ("flat", f"swh:1:dir:{unknown_directory}"), ("gitfast", f"swh:1:rev:{unknown_revision}"), ("git_bare", f"swh:1:rev:{unknown_revision}"), ): swhid = CoreSWHID.from_string(swhid) url = reverse( f"api-1-vault-cook-{bundle_type.replace('_', '-')}", url_args={"swhid": str(swhid)}, ) rv = check_api_post_responses(api_client, url, data=None, status_code=404) assert rv.data["exception"] == "NotFoundExc" assert rv.data["reason"] == f"{swhid} not found." 
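# The 404 originates from the mocked vault backend, so the cooking
# request must have been forwarded to it with the expected arguments
# (checked just below).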
mock_vault.cook.assert_called_with(bundle_type, swhid, email=None) fetch_url = reverse( f"api-1-vault-fetch-{bundle_type.replace('_', '-')}", url_args={"swhid": str(swhid)}, ) rv = check_api_get_responses(api_client, fetch_url, status_code=404) assert rv.data["exception"] == "NotFoundExc" assert rv.data["reason"] == f"Cooked archive for {swhid} not found." mock_vault.fetch.assert_called_with(bundle_type, swhid) @pytest.mark.parametrize("bundle_type", ["flat", "gitfast", "git_bare"]) def test_api_vault_cook_error_content(api_client, mocker, bundle_type): swhid = "swh:1:cnt:" + "0" * 40 email = "test@test.mail" url = reverse( f"api-1-vault-cook-{bundle_type.replace('_', '-')}", url_args={"swhid": swhid}, query_params={"email": email}, ) rv = check_api_post_responses(api_client, url, data=None, status_code=400) assert rv.data == { "exception": "BadInputExc", "reason": ( "Content objects do not need to be cooked, " "use `/api/1/content/raw/` instead." ), } @pytest.mark.parametrize( "bundle_type,swhid_type,hint", [ ("flat", "rev", True), ("flat", "rel", False), ("flat", "snp", False), ("gitfast", "dir", True), ("gitfast", "rel", False), ("gitfast", "snp", False), ("git_bare", "dir", True), ("git_bare", "rel", False), ("git_bare", "snp", False), ], ) def test_api_vault_cook_error(api_client, mocker, bundle_type, swhid_type, hint): swhid = f"swh:1:{swhid_type}:" + "0" * 40 email = "test@test.mail" url = reverse( f"api-1-vault-cook-{bundle_type.replace('_', '-')}", url_args={"swhid": swhid}, query_params={"email": email}, ) rv = check_api_post_responses(api_client, url, data=None, status_code=400) assert rv.data["exception"] == "BadInputExc" if hint: assert re.match( r"Only .* can be cooked as .* bundles\. Use .*", rv.data["reason"] ) else: assert re.match(r"Only .* can be cooked as .* bundles\.", rv.data["reason"]) ##################### # Legacy API: def test_api_vault_cook_legacy(api_client, mocker, directory, revision): mock_archive = mocker.patch("swh.web.api.views.vault.archive") for obj_type, bundle_type, response_obj_type, obj_id in ( ("directory", "flat", "directory", directory), ("revision_gitfast", "gitfast", "revision", revision), ): swhid = CoreSWHID.from_string(f"swh:1:{obj_type[:3]}:{obj_id}") fetch_url = reverse( - f"api-1-vault-fetch-{bundle_type}", url_args={"swhid": str(swhid)}, + f"api-1-vault-fetch-{bundle_type}", + url_args={"swhid": str(swhid)}, ) stub_cook = { "type": obj_type, "progress_msg": None, "task_id": 1, "task_status": "done", "swhid": swhid, "obj_type": response_obj_type, "obj_id": obj_id, } stub_fetch = b"content" mock_archive.vault_cook.return_value = stub_cook mock_archive.vault_fetch.return_value = stub_fetch email = "test@test.mail" url = reverse( f"api-1-vault-cook-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id}, query_params={"email": email}, ) rv = check_api_post_responses(api_client, url, data=None, status_code=200) assert rv.data == { "fetch_url": rv.wsgi_request.build_absolute_uri(fetch_url), "progress_message": None, "id": 1, "status": "done", "swhid": str(swhid), "obj_type": response_obj_type, "obj_id": obj_id, } mock_archive.vault_cook.assert_called_with(bundle_type, swhid, email) rv = check_http_get_response(api_client, fetch_url, status_code=200) assert rv["Content-Type"] == "application/gzip" assert rv.content == stub_fetch mock_archive.vault_fetch.assert_called_with(bundle_type, swhid) def test_api_vault_cook_uppercase_hash_legacy(api_client, directory, revision): for obj_type, obj_id in ( ("directory", directory), ("revision_gitfast", 
revision), ): url = reverse( f"api-1-vault-cook-{obj_type}-uppercase-checksum", url_args={f"{obj_type[:3]}_id": obj_id.upper()}, ) rv = check_http_post_response( api_client, url, data={"email": "test@test.mail"}, status_code=302 ) redirect_url = reverse( f"api-1-vault-cook-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id} ) assert rv["location"] == redirect_url fetch_url = reverse( f"api-1-vault-fetch-{obj_type}-uppercase-checksum", url_args={f"{obj_type[:3]}_id": obj_id.upper()}, ) rv = check_http_get_response(api_client, fetch_url, status_code=302) redirect_url = reverse( - f"api-1-vault-fetch-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id}, + f"api-1-vault-fetch-{obj_type}", + url_args={f"{obj_type[:3]}_id": obj_id}, ) assert rv["location"] == redirect_url def test_api_vault_cook_notfound_legacy( api_client, mocker, directory, revision, unknown_directory, unknown_revision ): mock_vault = mocker.patch("swh.web.common.archive.vault") mock_vault.cook.side_effect = NotFoundExc("object not found") mock_vault.fetch.side_effect = NotFoundExc("cooked archive not found") mock_vault.progress.side_effect = NotFoundExc("cooking request not found") for obj_type, bundle_type, obj_id in ( ("directory", "flat", directory), ("revision_gitfast", "gitfast", revision), ): url = reverse( - f"api-1-vault-cook-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id}, + f"api-1-vault-cook-{obj_type}", + url_args={f"{obj_type[:3]}_id": obj_id}, ) swhid = CoreSWHID.from_string(f"swh:1:{obj_type[:3]}:{obj_id}") rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data["exception"] == "NotFoundExc" assert rv.data["reason"] == f"Cooking of {swhid} was never requested." mock_vault.progress.assert_called_with(bundle_type, swhid) for obj_type, bundle_type, obj_id in ( ("directory", "flat", unknown_directory), ("revision_gitfast", "gitfast", unknown_revision), ): swhid = CoreSWHID.from_string(f"swh:1:{obj_type[:3]}:{obj_id}") url = reverse( f"api-1-vault-cook-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id} ) rv = check_api_post_responses(api_client, url, data=None, status_code=404) assert rv.data["exception"] == "NotFoundExc" assert rv.data["reason"] == f"{swhid} not found." mock_vault.cook.assert_called_with(bundle_type, swhid, email=None) fetch_url = reverse( - f"api-1-vault-fetch-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id}, + f"api-1-vault-fetch-{obj_type}", + url_args={f"{obj_type[:3]}_id": obj_id}, ) # Redirected to the current 'fetch' url rv = check_http_get_response(api_client, fetch_url, status_code=302) redirect_url = reverse( - f"api-1-vault-fetch-{bundle_type}", url_args={"swhid": str(swhid)}, + f"api-1-vault-fetch-{bundle_type}", + url_args={"swhid": str(swhid)}, ) assert rv["location"] == redirect_url rv = check_api_get_responses(api_client, redirect_url, status_code=404) assert rv.data["exception"] == "NotFoundExc" assert rv.data["reason"] == f"Cooked archive for {swhid} not found." 
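# After the redirect, the legacy fetch endpoint presumably delegates to
# the same vault.fetch call as the current API, with new-style
# (bundle_type, swhid) arguments.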
mock_vault.fetch.assert_called_with(bundle_type, swhid) diff --git a/swh/web/tests/auth/test_mailmap.py b/swh/web/tests/auth/test_mailmap.py index a99a4a04..a948ece2 100644 --- a/swh/web/tests/auth/test_mailmap.py +++ b/swh/web/tests/auth/test_mailmap.py @@ -1,594 +1,600 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from io import StringIO import json from typing import Dict from psycopg2.extras import execute_values import pytest from django.core.management import call_command from django.db import transaction from swh.model.model import Person from swh.web.auth.models import UserMailmap, UserMailmapEvent from swh.web.common.utils import reverse from swh.web.tests.utils import ( check_api_post_response, check_http_get_response, check_http_post_response, ) @pytest.mark.django_db(transaction=True) @pytest.mark.parametrize("view_name", ["profile-mailmap-add", "profile-mailmap-update"]) def test_mailmap_endpoints_anonymous_user(api_client, view_name): url = reverse(view_name) check_api_post_response(api_client, url, status_code=403) @pytest.mark.django_db(transaction=True) def test_mailmap_endpoints_user_with_permission( api_client, mailmap_user, mailmap_admin ): for user, name in ((mailmap_user, "bar"), (mailmap_admin, "baz")): UserMailmapEvent.objects.all().delete() api_client.force_login(user) request_data = {"from_email": f"{name}@example.org", "display_name": name} for view_name in ("profile-mailmap-add", "profile-mailmap-update"): url = reverse(view_name) check_api_post_response( - api_client, url, data=request_data, status_code=200, + api_client, + url, + data=request_data, + status_code=200, ) # FIXME: use check_api_get_responses; currently this crashes without # content_type="application/json" resp = check_http_get_response( api_client, reverse("profile-mailmap-list"), status_code=200, content_type="application/json", ).data assert len(resp) == 1 assert resp[0]["from_email"] == f"{name}@example.org" assert resp[0]["display_name"] == name events = UserMailmapEvent.objects.order_by("timestamp").all() assert len(events) == 2 assert events[0].request_type == "add" assert json.loads(events[0].request) == request_data assert events[1].request_type == "update" assert json.loads(events[1].request) == request_data @pytest.mark.django_db(transaction=True) def test_mailmap_add_duplicate(api_client, mailmap_user, mailmap_admin): for user, name in ((mailmap_user, "foo"), (mailmap_admin, "bar")): api_client.force_login(user) check_api_post_response( api_client, reverse("profile-mailmap-add"), data={"from_email": f"{name}@example.org", "display_name": name}, status_code=200, ) check_api_post_response( api_client, reverse("profile-mailmap-add"), data={"from_email": f"{name}@example.org", "display_name": name}, status_code=400, ) @pytest.mark.django_db(transaction=True) def test_mailmap_add_full(api_client, mailmap_user, mailmap_admin): for user, name in ((mailmap_user, "foo"), (mailmap_admin, "bar")): api_client.force_login(user) UserMailmapEvent.objects.all().delete() request_data = { "from_email": f"{name}@example.org", "from_email_verified": True, "from_email_verification_request_date": "2021-02-07T14:04:15Z", "display_name": name, "display_name_activated": True, "to_email": "baz@example.org", "to_email_verified": True, "to_email_verification_request_date": 
"2021-02-07T15:54:59Z", } check_api_post_response( api_client, reverse("profile-mailmap-add"), data=request_data, status_code=200, ) resp = check_http_get_response( api_client, reverse("profile-mailmap-list"), status_code=200, content_type="application/json", ).data assert len(resp) == 1 assert resp[0].items() >= request_data.items() events = UserMailmapEvent.objects.all() assert len(events) == 1 assert events[0].request_type == "add" assert json.loads(events[0].request) == request_data assert events[0].successful @pytest.mark.django_db(transaction=True) def test_mailmap_endpoints_error_response(api_client, mailmap_user, mailmap_admin): for user in (mailmap_user, mailmap_admin): api_client.force_login(user) UserMailmapEvent.objects.all().delete() url = reverse("profile-mailmap-add") resp = check_api_post_response(api_client, url, status_code=400) assert b"from_email" in resp.content url = reverse("profile-mailmap-update") resp = check_api_post_response(api_client, url, status_code=400) assert b"from_email" in resp.content events = UserMailmapEvent.objects.order_by("timestamp").all() assert len(events) == 2 assert events[0].request_type == "add" assert json.loads(events[0].request) == {} assert not events[0].successful assert events[1].request_type == "update" assert json.loads(events[1].request) == {} assert not events[1].successful @pytest.mark.django_db(transaction=True) def test_mailmap_update(api_client, mailmap_user, mailmap_admin): for user, name in ((mailmap_user, "foo"), (mailmap_admin, "bar")): api_client.force_login(user) UserMailmapEvent.objects.all().delete() before_add = datetime.datetime.now(tz=datetime.timezone.utc) check_api_post_response( api_client, reverse("profile-mailmap-add"), data={ "from_email": f"{name}1@example.org", "display_name": "Display Name 1", }, status_code=200, ) check_api_post_response( api_client, reverse("profile-mailmap-add"), data={ "from_email": f"{name}2@example.org", "display_name": "Display Name 2", }, status_code=200, ) after_add = datetime.datetime.now(tz=datetime.timezone.utc) user_id = None if user == mailmap_admin else str(user.id) mailmaps = list( UserMailmap.objects.filter(user_id=user_id).order_by("from_email").all() ) assert len(mailmaps) == 2, mailmaps assert mailmaps[0].from_email == f"{name}1@example.org", mailmaps assert mailmaps[0].display_name == "Display Name 1", mailmaps assert before_add <= mailmaps[0].last_update_date <= after_add assert mailmaps[1].from_email == f"{name}2@example.org", mailmaps assert mailmaps[1].display_name == "Display Name 2", mailmaps assert before_add <= mailmaps[0].last_update_date <= after_add before_update = datetime.datetime.now(tz=datetime.timezone.utc) check_api_post_response( api_client, reverse("profile-mailmap-update"), data={ "from_email": f"{name}1@example.org", "display_name": "Display Name 1b", }, status_code=200, ) after_update = datetime.datetime.now(tz=datetime.timezone.utc) mailmaps = list( UserMailmap.objects.filter(user_id=user_id).order_by("from_email").all() ) assert len(mailmaps) == 2, mailmaps assert mailmaps[0].from_email == f"{name}1@example.org", mailmaps assert mailmaps[0].display_name == "Display Name 1b", mailmaps assert before_update <= mailmaps[0].last_update_date <= after_update assert mailmaps[1].from_email == f"{name}2@example.org", mailmaps assert mailmaps[1].display_name == "Display Name 2", mailmaps assert before_add <= mailmaps[1].last_update_date <= after_add events = UserMailmapEvent.objects.order_by("timestamp").all() assert len(events) == 3 assert 
events[0].request_type == "add" assert events[1].request_type == "add" assert events[2].request_type == "update" @pytest.mark.django_db(transaction=True) def test_mailmap_update_from_email_not_found(api_client, mailmap_admin): api_client.force_login(mailmap_admin) check_api_post_response( api_client, reverse("profile-mailmap-update"), - data={"from_email": "invalid@example.org", "display_name": "Display Name",}, + data={ + "from_email": "invalid@example.org", + "display_name": "Display Name", + }, status_code=404, ) NB_MAILMAPS = 20 MM_PER_PAGE = 10 def _create_mailmaps(client): mailmaps = [] for i in range(NB_MAILMAPS): resp = check_http_post_response( client, reverse("profile-mailmap-add"), data={ "from_email": f"user{i:02d}@example.org", "display_name": f"User {i:02d}", }, status_code=200, ) mailmaps.append(json.loads(resp.content)) return mailmaps @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_mailmap_list_datatables_no_parameters(client, mailmap_admin): client.force_login(mailmap_admin) mailmaps = _create_mailmaps(client) url = reverse("profile-mailmap-list-datatables") resp = check_http_get_response(client, url, status_code=200) mailmap_data = json.loads(resp.content) assert mailmap_data["recordsTotal"] == NB_MAILMAPS assert mailmap_data["recordsFiltered"] == NB_MAILMAPS # mailmaps sorted by ascending from_email by default for i in range(10): assert mailmap_data["data"][i]["from_email"] == mailmaps[i]["from_email"] @pytest.mark.django_db(transaction=True, reset_sequences=True) @pytest.mark.parametrize("sort_direction", ["asc", "desc"]) def test_mailmap_list_datatables_ordering(client, mailmap_admin, sort_direction): client.force_login(mailmap_admin) mailmaps = _create_mailmaps(client) mailmaps_sorted = list(sorted(mailmaps, key=lambda d: d["display_name"])) all_display_names = [mm["display_name"] for mm in mailmaps_sorted] if sort_direction == "desc": all_display_names = list(reversed(all_display_names)) for i in range(NB_MAILMAPS // MM_PER_PAGE): url = reverse( "profile-mailmap-list-datatables", query_params={ "draw": i, "length": MM_PER_PAGE, "start": i * MM_PER_PAGE, "order[0][column]": 2, "order[0][dir]": sort_direction, "columns[2][name]": "display_name", }, ) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == i assert data["recordsFiltered"] == NB_MAILMAPS assert data["recordsTotal"] == NB_MAILMAPS assert len(data["data"]) == MM_PER_PAGE display_names = [mm["display_name"] for mm in data["data"]] expected_display_names = all_display_names[ i * MM_PER_PAGE : (i + 1) * MM_PER_PAGE ] assert display_names == expected_display_names @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_mailmap_list_datatables_search(client, mailmap_admin): client.force_login(mailmap_admin) _create_mailmaps(client) search_value = "user1" url = reverse( "profile-mailmap-list-datatables", query_params={ "draw": 1, "length": MM_PER_PAGE, "start": 0, "search[value]": search_value, }, ) resp = check_http_get_response(client, url, status_code=200) data = json.loads(resp.content) assert data["draw"] == 1 assert data["recordsFiltered"] == MM_PER_PAGE assert data["recordsTotal"] == NB_MAILMAPS assert len(data["data"]) == MM_PER_PAGE for mailmap in data["data"]: assert search_value in mailmap["from_email"] def populate_mailmap(): for (verified, activated) in ( (False, False), (False, True), (True, False), (True, True), ): verified_str = "V" if verified else "" activated_str = "A" if activated else "" 
UserMailmap.objects.create( from_email=f"from_email{verified_str}{activated_str}@example.com", display_name=f"Display Name {verified_str} {activated_str}".strip(), from_email_verified=verified, display_name_activated=activated, ) def call_sync_mailmaps(*args) -> str: out = StringIO() err = StringIO() call_command("sync_mailmaps", *args, stdout=out, stderr=err) out.seek(0) err.seek(0) assert err.read() == "" return out.read() MAILMAP_KNOWN_FULLNAMES = ( "Original Name <from_email@example.com>", "Original Name V <from_emailV@example.com>", "Original Name A <from_emailA@example.com>", "Original Name V A <from_emailVA@example.com>", "Original Name V A 2 <from_emailVA@example.com>", "Original Name V A 3 <from_emailVA@example.com>", ) MAILMAP_KNOWN_PEOPLE = tuple( Person.from_fullname(f.encode()) for f in MAILMAP_KNOWN_FULLNAMES ) def init_stub_storage_db(postgresql): cur = postgresql.cursor() cur.execute( """ CREATE TABLE person ( fullname bytea PRIMARY KEY, name bytea, email bytea, displayname bytea ) """ ) execute_values( cur, "INSERT INTO person (fullname, name, email) VALUES %s", (p.to_dict() for p in MAILMAP_KNOWN_PEOPLE), template="(%(fullname)s, %(name)s, %(email)s)", ) cur.execute("CREATE INDEX ON person (email)") postgresql.commit() cur.close() return postgresql.dsn def get_displaynames(postgresql) -> Dict[str, str]: with postgresql.cursor() as cur: cur.execute( "SELECT fullname, displayname FROM person WHERE displayname IS NOT NULL" ) return {bytes(f).decode("utf-8"): bytes(d).decode("utf-8") for (f, d) in cur} @pytest.mark.django_db(transaction=True) def test_sync_mailmaps_dry_run(postgresql): with transaction.atomic(): populate_mailmap() dsn = init_stub_storage_db(postgresql) out = call_sync_mailmaps(dsn) assert "(dry run)" in out assert "Synced 1 mailmaps to swh.storage database" in out assert get_displaynames(postgresql) == {} assert ( UserMailmap.objects.filter( from_email_verified=True, display_name_activated=True, mailmap_last_processing_date__isnull=False, ).count() == 0 ) @pytest.mark.django_db(transaction=True) def test_sync_mailmaps_perform(postgresql): with transaction.atomic(): populate_mailmap() dsn = init_stub_storage_db(postgresql) out = call_sync_mailmaps("--perform", dsn) assert "(dry run)" not in out assert "Synced 1 mailmaps to swh.storage database" in out expected_displaynames = { "Original Name V A <from_emailVA@example.com>": "Display Name V A", "Original Name V A 2 <from_emailVA@example.com>": "Display Name V A", "Original Name V A 3 <from_emailVA@example.com>": "Display Name V A", } assert get_displaynames(postgresql) == expected_displaynames assert ( UserMailmap.objects.filter( from_email_verified=True, display_name_activated=True, mailmap_last_processing_date__isnull=False, ).count() == 1 ) @pytest.mark.django_db(transaction=True) def test_sync_mailmaps_with_to_email(postgresql): with transaction.atomic(): populate_mailmap() dsn = init_stub_storage_db(postgresql) call_sync_mailmaps("--perform", dsn) expected_displaynames = { "Original Name V A <from_emailVA@example.com>": "Display Name V A", "Original Name V A 2 <from_emailVA@example.com>": "Display Name V A", "Original Name V A 3 <from_emailVA@example.com>": "Display Name V A", } assert get_displaynames(postgresql) == expected_displaynames # Add a not-yet-verified to_email (it should not be used for display names yet) with transaction.atomic(): for mailmap in UserMailmap.objects.filter( from_email_verified=True, display_name_activated=True ): mailmap.to_email = "to_email@example.com" mailmap.save() call_sync_mailmaps("--perform", dsn) assert
get_displaynames(postgresql) == expected_displaynames # Verify the relevant to_email with transaction.atomic(): for mailmap in UserMailmap.objects.filter( from_email_verified=True, display_name_activated=True ): mailmap.to_email_verified = True mailmap.save() call_sync_mailmaps("--perform", dsn) new_displayname = "Display Name V A <to_email@example.com>" expected_displaynames = { "Original Name V A <from_emailVA@example.com>": new_displayname, "Original Name V A 2 <from_emailVA@example.com>": new_displayname, "Original Name V A 3 <from_emailVA@example.com>": new_displayname, } assert get_displaynames(postgresql) == expected_displaynames @pytest.mark.django_db(transaction=True) def test_sync_mailmaps_disable(postgresql): """Check that disabling a mailmap only happens once""" with transaction.atomic(): populate_mailmap() dsn = init_stub_storage_db(postgresql) # Do the initial mailmap sync call_sync_mailmaps("--perform", dsn) assert len(get_displaynames(postgresql)) == 3 updated = 0 # Disable a display name with transaction.atomic(): # Cannot use update() because `last_update_date` would not be updated for mailmap in UserMailmap.objects.filter( from_email_verified=True, display_name_activated=True ): mailmap.display_name_activated = False mailmap.save() updated += 1 assert updated == 1 # Sync mailmaps again out = call_sync_mailmaps("--perform", dsn) assert "1 mailmaps to disable" in out assert get_displaynames(postgresql) == {} # Update a displayname by hand with postgresql.cursor() as cur: cur.execute( "UPDATE person SET displayname='Manual Display Name' " "WHERE fullname='Original Name V A <from_emailVA@example.com>'" ) expected_displaynames = { "Original Name V A <from_emailVA@example.com>": "Manual Display Name" } assert get_displaynames(postgresql) == expected_displaynames # Sync mailmaps one last time. 
No mailmaps should be disabled out = call_sync_mailmaps("--perform", dsn) assert "0 mailmaps to disable" in out assert get_displaynames(postgresql) == expected_displaynames diff --git a/swh/web/tests/auth/test_views.py b/swh/web/tests/auth/test_views.py index cc134809..4cd3edaf 100644 --- a/swh/web/tests/auth/test_views.py +++ b/swh/web/tests/auth/test_views.py @@ -1,306 +1,312 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json from urllib.parse import urljoin, urlparse import uuid import pytest from django.http import QueryDict from swh.auth.keycloak import KeycloakError from swh.web.auth.models import OIDCUserOfflineTokens from swh.web.auth.utils import OIDC_SWH_WEB_CLIENT_ID, decrypt_data from swh.web.common.utils import reverse from swh.web.config import get_config from swh.web.tests.django_asserts import assert_contains from swh.web.tests.utils import ( check_html_get_response, check_http_get_response, check_http_post_response, ) from swh.web.urls import _default_view as homepage_view def _check_oidc_login_code_flow_data( request, response, keycloak_oidc, redirect_uri, scope="openid" ): parsed_url = urlparse(response["location"]) authorization_url = keycloak_oidc.well_known()["authorization_endpoint"] query_dict = QueryDict(parsed_url.query) # check the redirect url is valid assert urljoin(response["location"], parsed_url.path) == authorization_url assert "client_id" in query_dict assert query_dict["client_id"] == OIDC_SWH_WEB_CLIENT_ID assert "response_type" in query_dict assert query_dict["response_type"] == "code" assert "redirect_uri" in query_dict assert query_dict["redirect_uri"] == redirect_uri assert "code_challenge_method" in query_dict assert query_dict["code_challenge_method"] == "S256" assert "scope" in query_dict assert query_dict["scope"] == scope assert "state" in query_dict assert "code_challenge" in query_dict # check login_data has been registered in the user session assert "login_data" in request.session login_data = request.session["login_data"] assert "code_verifier" in login_data assert "state" in login_data assert "redirect_uri" in login_data assert login_data["redirect_uri"] == query_dict["redirect_uri"] return login_data def test_view_rendering_when_user_not_set_in_request(request_factory): request = request_factory.get("/") # Django RequestFactory does not set any user by default assert not hasattr(request, "user") response = homepage_view(request) assert response.status_code == 200 def test_oidc_generate_bearer_token_anonymous_user(client): """ Anonymous users should be refused access with a forbidden response.
""" url = reverse("oidc-generate-bearer-token") check_http_get_response(client, url, status_code=403) def _generate_and_test_bearer_token(client, kc_oidc_mock): # user authenticates client.login( code="code", code_verifier="code-verifier", redirect_uri="redirect-uri" ) # user initiates bearer token generation flow url = reverse("oidc-generate-bearer-token") response = check_http_get_response(client, url, status_code=302) request = response.wsgi_request redirect_uri = reverse("oidc-generate-bearer-token-complete", request=request) # check login data and the redirection to Keycloak are valid login_data = _check_oidc_login_code_flow_data( request, response, kc_oidc_mock, redirect_uri=redirect_uri, scope="openid offline_access", ) # once the user has authenticated with Keycloak, they are # redirected to the 'oidc-generate-bearer-token-complete' view # to retrieve and save the bearer token # generate authorization code / session state in the same # manner as Keycloak code = f"{str(uuid.uuid4())}.{str(uuid.uuid4())}.{str(uuid.uuid4())}" session_state = str(uuid.uuid4()) token_complete_url = reverse( "oidc-generate-bearer-token-complete", query_params={ "code": code, "state": login_data["state"], "session_state": session_state, }, ) nb_tokens = len(OIDCUserOfflineTokens.objects.all()) response = check_http_get_response(client, token_complete_url, status_code=302) request = response.wsgi_request # check the token has been generated and saved encrypted in the database assert len(OIDCUserOfflineTokens.objects.all()) == nb_tokens + 1 encrypted_token = OIDCUserOfflineTokens.objects.last().offline_token.tobytes() secret = get_config()["secret_key"].encode() salt = request.user.sub.encode() decrypted_token = decrypt_data(encrypted_token, secret, salt) oidc_profile = kc_oidc_mock.authorization_code(code=code, redirect_uri=redirect_uri) assert decrypted_token.decode("ascii") == oidc_profile["refresh_token"] # should redirect to the tokens management Web UI assert response["location"] == reverse("oidc-profile") + "#tokens" return decrypted_token @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_oidc_generate_bearer_token_authenticated_user_success(client, keycloak_oidc): """ An authenticated user should be able to generate a bearer token using the OIDC Authorization Code Flow. """ _generate_and_test_bearer_token(client, keycloak_oidc) def test_oidc_list_bearer_tokens_anonymous_user(client): """ Anonymous users should be refused access with a forbidden response. """ url = reverse( "oidc-list-bearer-tokens", query_params={"draw": 1, "start": 0, "length": 10} ) check_http_get_response(client, url, status_code=403) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_oidc_list_bearer_tokens(client, keycloak_oidc): """ A user with correct credentials should be allowed to list their tokens. """ nb_tokens = 3 for _ in range(nb_tokens): _generate_and_test_bearer_token(client, keycloak_oidc) url = reverse( "oidc-list-bearer-tokens", query_params={"draw": 1, "start": 0, "length": 10} ) response = check_http_get_response(client, url, status_code=200) tokens_data = list(reversed(json.loads(response.content.decode("utf-8"))["data"])) for oidc_token in OIDCUserOfflineTokens.objects.all(): assert ( oidc_token.creation_date.isoformat() == tokens_data[oidc_token.id - 1]["creation_date"] ) def test_oidc_get_bearer_token_anonymous_user(client): """ Anonymous users should be refused access with a forbidden response.
""" url = reverse("oidc-get-bearer-token") check_http_post_response(client, url, status_code=403) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_oidc_get_bearer_token(client, keycloak_oidc): """ A user with correct credentials should be allowed to display a token. """ nb_tokens = 3 for i in range(nb_tokens): token = _generate_and_test_bearer_token(client, keycloak_oidc) url = reverse("oidc-get-bearer-token") response = check_http_post_response( client, url, status_code=200, data={"token_id": i + 1}, content_type="text/plain", ) assert response.content == token @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_oidc_get_bearer_token_expired_token(client, keycloak_oidc): """ Trying to display an expired bearer token should return an error response. """ _generate_and_test_bearer_token(client, keycloak_oidc) for kc_err_msg in ("Offline session not active", "Offline user session not found"): kc_error_dict = { "error": "invalid_grant", "error_description": kc_err_msg, } keycloak_oidc.refresh_token.side_effect = KeycloakError( error_message=json.dumps(kc_error_dict).encode(), response_code=400 ) url = reverse("oidc-get-bearer-token") response = check_http_post_response( client, url, status_code=400, data={"token_id": 1}, content_type="text/plain", ) assert ( response.content == b"Bearer token has expired, please generate a new one." ) def test_oidc_revoke_bearer_tokens_anonymous_user(client): """ Anonymous users should be refused access with a forbidden response. """ url = reverse("oidc-revoke-bearer-tokens") check_http_post_response(client, url, status_code=403) @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_oidc_revoke_bearer_tokens(client, keycloak_oidc): """ A user with correct credentials should be allowed to revoke tokens. """ nb_tokens = 3 for _ in range(nb_tokens): _generate_and_test_bearer_token(client, keycloak_oidc) url = reverse("oidc-revoke-bearer-tokens") check_http_post_response( - client, url, status_code=200, data={"token_ids": [1]}, + client, + url, + status_code=200, + data={"token_ids": [1]}, ) assert len(OIDCUserOfflineTokens.objects.all()) == 2 check_http_post_response( - client, url, status_code=200, data={"token_ids": [2, 3]}, + client, + url, + status_code=200, + data={"token_ids": [2, 3]}, ) assert len(OIDCUserOfflineTokens.objects.all()) == 0 def test_oidc_profile_view_anonymous_user(client): """ Unauthenticated users should be redirected to the login page when requesting the profile view. """ url = reverse("oidc-profile") login_url = reverse("oidc-login", query_params={"next_path": url}) resp = check_http_get_response(client, url, status_code=302) assert resp["location"] == login_url @pytest.mark.django_db(transaction=True, reset_sequences=True) def test_oidc_profile_view(client, keycloak_oidc): """ Authenticated users should be able to request the profile page, and a link to the Keycloak account UI should be present.
""" url = reverse("oidc-profile") kc_config = get_config()["keycloak"] client_permissions = ["perm1", "perm2"] keycloak_oidc.client_permissions = client_permissions client.login(code="", code_verifier="", redirect_uri="") resp = check_html_get_response( client, url, status_code=200, template_used="auth/profile.html" ) user = resp.wsgi_request.user kc_account_url = ( f"{kc_config['server_url']}realms/{kc_config['realm_name']}/account/" ) assert_contains(resp, kc_account_url) assert_contains(resp, user.username) assert_contains(resp, user.first_name) assert_contains(resp, user.last_name) assert_contains(resp, user.email) for perm in client_permissions: assert_contains(resp, perm) diff --git a/swh/web/tests/browse/test_snapshot_context.py b/swh/web/tests/browse/test_snapshot_context.py index 56c14a0d..6d830ac6 100644 --- a/swh/web/tests/browse/test_snapshot_context.py +++ b/swh/web/tests/browse/test_snapshot_context.py @@ -1,507 +1,524 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random from swh.model.hashutil import hash_to_bytes from swh.model.model import ObjectType as ModelObjectType from swh.model.model import Release, Snapshot, SnapshotBranch, TargetType from swh.model.swhids import ObjectType from swh.web.browse.snapshot_context import ( _get_release, get_origin_visit_snapshot, get_snapshot_content, get_snapshot_context, ) from swh.web.browse.utils import gen_revision_url from swh.web.common.identifiers import gen_swhid from swh.web.common.origin_visits import get_origin_visit, get_origin_visits from swh.web.common.typing import ( SnapshotBranchInfo, SnapshotContext, SnapshotReleaseInfo, ) from swh.web.common.utils import format_utc_iso_date, reverse def test_get_origin_visit_snapshot_simple(archive_data, origin_with_multiple_visits): visits = archive_data.origin_visit_get(origin_with_multiple_visits["url"]) for visit in visits: snapshot = archive_data.snapshot_get(visit["snapshot"]) branches = [] releases = [] def _process_branch_data(branch, branch_data, alias=False): if branch_data["target_type"] == "revision": rev_data = archive_data.revision_get(branch_data["target"]) branches.append( SnapshotBranchInfo( name=branch, alias=alias, revision=branch_data["target"], directory=rev_data["directory"], date=format_utc_iso_date(rev_data["date"]), message=rev_data["message"], url=None, ) ) elif branch_data["target_type"] == "release": rel_data = archive_data.release_get(branch_data["target"]) rev_data = archive_data.revision_get(rel_data["target"]) releases.append( SnapshotReleaseInfo( name=rel_data["name"], alias=alias, branch_name=branch, date=format_utc_iso_date(rel_data["date"]), id=rel_data["id"], message=rel_data["message"], target_type=rel_data["target_type"], target=rel_data["target"], directory=rev_data["directory"], url=None, ) ) aliases = {} for branch in sorted(snapshot["branches"].keys()): branch_data = snapshot["branches"][branch] if branch_data["target_type"] == "alias": target_data = snapshot["branches"][branch_data["target"]] aliases[branch] = target_data _process_branch_data(branch, target_data, alias=True) else: _process_branch_data(branch, branch_data) assert branches and releases, "Incomplete test data." 
origin_visit_branches = get_origin_visit_snapshot( origin_with_multiple_visits, visit_id=visit["visit"] ) assert origin_visit_branches == (branches, releases, aliases) def test_get_snapshot_context_no_origin(archive_data, snapshot): for browse_context, kwargs in ( ("content", {"snapshot_id": snapshot, "path": "/some/path"}), ("directory", {"snapshot_id": snapshot}), ("log", {"snapshot_id": snapshot}), ): url_args = {"snapshot_id": snapshot} query_params = dict(kwargs) query_params.pop("snapshot_id") snapshot_context = get_snapshot_context(**kwargs, browse_context=browse_context) branches, releases, _ = get_snapshot_content(snapshot) releases = list(reversed(releases)) revision_id = None root_directory = None for branch in branches: if branch["name"] == "HEAD": revision_id = branch["revision"] root_directory = branch["directory"] branch["url"] = reverse( f"browse-snapshot-{browse_context}", url_args=url_args, query_params={"branch": branch["name"], **query_params}, ) for release in releases: release["url"] = reverse( f"browse-snapshot-{browse_context}", url_args=url_args, query_params={"release": release["name"], **query_params}, ) branches_url = reverse("browse-snapshot-branches", url_args=url_args) releases_url = reverse("browse-snapshot-releases", url_args=url_args) directory_url = reverse("browse-snapshot-directory", url_args=url_args) is_empty = not branches and not releases snapshot_swhid = gen_swhid(ObjectType.SNAPSHOT, snapshot) snapshot_sizes = archive_data.snapshot_count_branches(snapshot) expected = SnapshotContext( branch="HEAD", branch_alias=True, branches=branches, branches_url=branches_url, is_empty=is_empty, origin_info=None, origin_visits_url=None, release=None, release_alias=False, release_id=None, query_params=query_params, releases=releases, releases_url=releases_url, revision_id=revision_id, revision_info=_get_revision_info(archive_data, revision_id), root_directory=root_directory, snapshot_id=snapshot, snapshot_sizes=snapshot_sizes, snapshot_swhid=snapshot_swhid, url_args=url_args, visit_info=None, directory_url=directory_url, ) if revision_id: expected["revision_info"]["revision_url"] = gen_revision_url( revision_id, snapshot_context ) assert snapshot_context == expected _check_branch_release_revision_parameters( archive_data, expected, browse_context, kwargs, branches, releases ) def test_get_snapshot_context_with_origin(archive_data, origin_with_multiple_visits): origin_visits = get_origin_visits(origin_with_multiple_visits) timestamp = format_utc_iso_date(origin_visits[0]["date"], "%Y-%m-%dT%H:%M:%SZ") visit_id = origin_visits[1]["visit"] origin_url = origin_with_multiple_visits["url"] for browse_context, kwargs in ( ("content", {"origin_url": origin_url, "path": "/some/path"}), ("directory", {"origin_url": origin_url}), ("log", {"origin_url": origin_url}), - ("directory", {"origin_url": origin_url, "timestamp": timestamp,},), - ("directory", {"origin_url": origin_url, "visit_id": visit_id,},), + ( + "directory", + { + "origin_url": origin_url, + "timestamp": timestamp, + }, + ), + ( + "directory", + { + "origin_url": origin_url, + "visit_id": visit_id, + }, + ), ): visit_id = kwargs["visit_id"] if "visit_id" in kwargs else None visit_ts = kwargs["timestamp"] if "timestamp" in kwargs else None visit_info = get_origin_visit( {"url": kwargs["origin_url"]}, visit_ts=visit_ts, visit_id=visit_id ) snapshot = visit_info["snapshot"] snapshot_context = get_snapshot_context(**kwargs, browse_context=browse_context) query_params = dict(kwargs) branches, releases, _ = 
get_snapshot_content(snapshot) releases = list(reversed(releases)) revision_id = None root_directory = None for branch in branches: if branch["name"] == "HEAD": revision_id = branch["revision"] root_directory = branch["directory"] branch["url"] = reverse( f"browse-origin-{browse_context}", query_params={"branch": branch["name"], **query_params}, ) for release in releases: release["url"] = reverse( f"browse-origin-{browse_context}", query_params={"release": release["name"], **query_params}, ) query_params.pop("path", None) branches_url = reverse("browse-origin-branches", query_params=query_params) releases_url = reverse("browse-origin-releases", query_params=query_params) origin_visits_url = reverse( "browse-origin-visits", query_params={"origin_url": kwargs["origin_url"]} ) is_empty = not branches and not releases snapshot_swhid = gen_swhid(ObjectType.SNAPSHOT, snapshot) snapshot_sizes = archive_data.snapshot_count_branches(snapshot) visit_info["url"] = directory_url = reverse( "browse-origin-directory", query_params=query_params ) visit_info["formatted_date"] = format_utc_iso_date(visit_info["date"]) if "path" in kwargs: query_params["path"] = kwargs["path"] expected = SnapshotContext( branch="HEAD", branch_alias=True, branches=branches, branches_url=branches_url, is_empty=is_empty, origin_info={"url": origin_url}, origin_visits_url=origin_visits_url, release=None, release_alias=False, release_id=None, query_params=query_params, releases=releases, releases_url=releases_url, revision_id=revision_id, revision_info=_get_revision_info(archive_data, revision_id), root_directory=root_directory, snapshot_id=snapshot, snapshot_sizes=snapshot_sizes, snapshot_swhid=snapshot_swhid, url_args={}, visit_info=visit_info, directory_url=directory_url, ) if revision_id: expected["revision_info"]["revision_url"] = gen_revision_url( revision_id, snapshot_context ) assert snapshot_context == expected _check_branch_release_revision_parameters( archive_data, expected, browse_context, kwargs, branches, releases ) def _check_branch_release_revision_parameters( - archive_data, base_expected_context, browse_context, kwargs, branches, releases, + archive_data, + base_expected_context, + browse_context, + kwargs, + branches, + releases, ): branch = random.choice(branches) snapshot_context = get_snapshot_context( **kwargs, browse_context=browse_context, branch_name=branch["name"] ) url_args = dict(kwargs) url_args.pop("path", None) url_args.pop("timestamp", None) url_args.pop("visit_id", None) url_args.pop("origin_url", None) query_params = dict(kwargs) query_params.pop("snapshot_id", None) expected_branch = dict(base_expected_context) expected_branch["branch"] = branch["name"] expected_branch["branch_alias"] = branch["alias"] expected_branch["revision_id"] = branch["revision"] expected_branch["revision_info"] = _get_revision_info( archive_data, branch["revision"] ) expected_branch["root_directory"] = branch["directory"] expected_branch["query_params"] = {"branch": branch["name"], **query_params} expected_branch["revision_info"]["revision_url"] = gen_revision_url( branch["revision"], expected_branch ) assert snapshot_context == expected_branch if releases: release = random.choice(releases) snapshot_context = get_snapshot_context( **kwargs, browse_context=browse_context, release_name=release["name"] ) expected_release = dict(base_expected_context) expected_release["branch"] = None expected_release["branch_alias"] = False expected_release["release"] = release["name"] expected_release["release_id"] = release["id"] if 
release["target_type"] == "revision": expected_release["revision_id"] = release["target"] expected_release["revision_info"] = _get_revision_info( archive_data, release["target"] ) expected_release["root_directory"] = release["directory"] expected_release["query_params"] = {"release": release["name"], **query_params} expected_release["revision_info"]["revision_url"] = gen_revision_url( release["target"], expected_release ) assert snapshot_context == expected_release revision_log = archive_data.revision_log(branch["revision"]) revision = revision_log[-1] snapshot_context = get_snapshot_context( **kwargs, browse_context=browse_context, revision_id=revision["id"] ) if "origin_url" in kwargs: view_name = f"browse-origin-{browse_context}" else: view_name = f"browse-snapshot-{browse_context}" kwargs.pop("visit_id", None) revision_browse_url = reverse( view_name, url_args=url_args, query_params={"revision": revision["id"], **query_params}, ) branches.append( SnapshotBranchInfo( name=revision["id"], alias=False, revision=revision["id"], directory=revision["directory"], date=revision["date"], message=revision["message"], url=revision_browse_url, ) ) expected_revision = dict(base_expected_context) expected_revision["branch"] = None expected_revision["branch_alias"] = False expected_revision["branches"] = branches expected_revision["revision_id"] = revision["id"] expected_revision["revision_info"] = _get_revision_info( archive_data, revision["id"] ) expected_revision["root_directory"] = revision["directory"] expected_revision["query_params"] = {"revision": revision["id"], **query_params} expected_revision["revision_info"]["revision_url"] = gen_revision_url( revision["id"], expected_revision ) assert snapshot_context == expected_revision def test_get_release_large_snapshot(archive_data, origin_with_releases): snapshot = archive_data.snapshot_get_latest(origin_with_releases["url"]) release_id = random.choice( [ v["target"] for v in snapshot["branches"].values() if v["target_type"] == "release" ] ) release_data = archive_data.release_get(release_id) # simulate large snapshot processing by providing releases parameter # as an empty list release = _get_release( releases=[], release_name=release_data["name"], snapshot_id=snapshot["id"] ) assert release_data["name"] == release["name"] assert release_data["id"] == release["id"] def _get_revision_info(archive_data, revision_id): revision_info = None if revision_id: revision_info = archive_data.revision_get(revision_id) revision_info["message_header"] = revision_info["message"].split("\n")[0] revision_info["date"] = format_utc_iso_date(revision_info["date"]) revision_info["committer_date"] = format_utc_iso_date( revision_info["committer_date"] ) return revision_info def test_get_snapshot_context_revision_release(archive_data, revision): release_name = "v1.0.0" release = Release( name=release_name.encode(), message=f"release {release_name}".encode(), target=hash_to_bytes(revision), target_type=ModelObjectType.REVISION, synthetic=True, ) archive_data.release_add([release]) snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( target=release_name.encode(), target_type=TargetType.ALIAS ), release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot]) snapshot_no_head = Snapshot( branches={ release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot_no_head]) revision_data = archive_data.revision_get(revision) for params 
in ( {"snapshot_id": snapshot.id.hex()}, {"snapshot_id": snapshot.id.hex(), "release_name": release_name}, {"snapshot_id": snapshot_no_head.id.hex()}, ): snapshot_context = get_snapshot_context(**params) assert snapshot_context["branches"] == [] assert snapshot_context["releases"] != [] assert snapshot_context["release"] == release_name assert snapshot_context["release_id"] == release.id.hex() assert snapshot_context["revision_id"] == revision assert snapshot_context["root_directory"] == revision_data["directory"] def test_get_snapshot_context_directory_release(archive_data, directory): release_name = "v1.0.0" release = Release( name=release_name.encode(), message=f"release {release_name}".encode(), target=hash_to_bytes(directory), target_type=ModelObjectType.DIRECTORY, synthetic=True, ) archive_data.release_add([release]) snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( target=release_name.encode(), target_type=TargetType.ALIAS ), release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot]) snapshot_no_head = Snapshot( branches={ release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot_no_head]) for params in ( {"snapshot_id": snapshot.id.hex()}, {"snapshot_id": snapshot.id.hex(), "release_name": release_name}, {"snapshot_id": snapshot_no_head.id.hex()}, ): snapshot_context = get_snapshot_context(**params) assert snapshot_context["branches"] == [] assert snapshot_context["releases"] != [] assert snapshot_context["release"] == release_name assert snapshot_context["release_id"] == release.id.hex() assert snapshot_context["revision_id"] is None assert snapshot_context["root_directory"] == directory diff --git a/swh/web/tests/browse/test_utils.py b/swh/web/tests/browse/test_utils.py index db74e6fc..fc899363 100644 --- a/swh/web/tests/browse/test_utils.py +++ b/swh/web/tests/browse/test_utils.py @@ -1,144 +1,147 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import re import pytest from swh.model.model import Content from swh.web.browse.utils import ( gen_link, gen_person_mail_link, gen_revision_link, get_mimetype_and_encoding_for_content, get_readme_to_display, prepare_content_for_display, re_encode_content, ) from swh.web.common.utils import reverse from swh.web.tests.data import get_tests_data def test_get_mimetype_and_encoding_for_content(): text = b"Hello world!" 
- assert get_mimetype_and_encoding_for_content(text) == ("text/plain", "us-ascii",) + assert get_mimetype_and_encoding_for_content(text) == ( + "text/plain", + "us-ascii", + ) def test_gen_link(): assert ( gen_link("https://www.softwareheritage.org/", "swh") == '<a href="https://www.softwareheritage.org/">swh</a>' ) def test_gen_revision_link(): revision_id = "28a0bc4120d38a394499382ba21d6965a67a3703" revision_url = reverse("browse-revision", url_args={"sha1_git": revision_id}) assert gen_revision_link( revision_id, link_text=None, link_attrs=None ) == '<a href="%s">%s</a>' % (revision_url, revision_id) assert gen_revision_link( revision_id, shorten_id=True, link_attrs=None ) == '<a href="%s">%s</a>' % (revision_url, revision_id[:7]) def test_gen_person_mail_link(): person_full = { "name": "John Doe", "email": "john.doe@swh.org", "fullname": "John Doe <john.doe@swh.org>", } assert gen_person_mail_link(person_full) == '<a href="mailto:%s">%s</a>' % ( person_full["email"], person_full["name"], ) link_text = "Mail" assert gen_person_mail_link( person_full, link_text=link_text ) == '<a href="mailto:%s">%s</a>' % (person_full["email"], link_text) person_partial_email = {"name": None, "email": None, "fullname": "john.doe@swh.org"} assert gen_person_mail_link( person_partial_email ) == '<a href="mailto:%s">%s</a>' % ( person_partial_email["fullname"], person_partial_email["fullname"], ) person_partial = { "name": None, "email": None, "fullname": "John Doe <john.doe@swh.org>", } assert gen_person_mail_link(person_partial) == person_partial["fullname"] person_none = {"name": None, "email": None, "fullname": None} assert gen_person_mail_link(person_none) == "None" @pytest.mark.parametrize( "path, expected_language", [("CMakeLists.txt", "cmake"), ("path/CMakeLists.txt", "cmake")], ) def test_prepare_content_display_language_for_filename(path, expected_language): content_display = prepare_content_for_display( content_data=b"", mime_type="", path=path ) assert content_display["language"] == expected_language def test_re_encode_content_for_shift_jis_encoding(): data = b"/* \x8a\xd6\x98A\x82\xcc\x95\xb6\x8e\x9a\x83R\x81[\x83h\x95\xcf\x8a\xb7 */" mime_type, encoding = get_mimetype_and_encoding_for_content(data) _, encoding, re_encoded_data = re_encode_content(mime_type, encoding, data) assert encoding == "SHIFT_JIS" assert data.decode(encoding) == re_encoded_data.decode("utf-8") assert re_encoded_data.decode("utf-8") == "/* 関連の文字コード変換 */" @pytest.mark.parametrize( "input_,expected_output", [ (b"foo bar", "<p>foo bar</p>"), (b"foo *bar* baz", "<p>foo <em>bar</em> baz</p>"), ( b".. raw:: html\n\n <script>foo</script>", "<script>foo</script>", ), ], ) def test_rst_readme(input_, expected_output): content = Content.from_data(input_) storage = get_tests_data()["storage"] storage.content_add([content]) assert re.search( expected_output, get_readme_to_display({"readme.rst": content.sha1.hex()})[2] ) def test_rst_readme_no_leak(): input_ = b".. include:: /etc/passwd" content = Content.from_data(input_) storage = get_tests_data()["storage"] storage.content_add([content]) assert "root:" not in get_readme_to_display({"readme.rst": content.sha1.hex()})[2] def test_rst_readme_no_xss(): input_ = b".. 
raw:: html\n\n <script>foo</script>" content = Content.from_data(input_) storage = get_tests_data()["storage"] storage.content_add([content]) assert ( "<script>" not in get_readme_to_display({"readme.rst": content.sha1.hex()})[2] ) diff --git a/swh/web/tests/browse/views/test_content.py b/swh/web/tests/browse/views/test_content.py index 28688d0d..1aebe7a4 100644 --- a/swh/web/tests/browse/views/test_content.py +++ b/swh/web/tests/browse/views/test_content.py @@ -1,1076 +1,1115 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random import re import pytest from django.utils.html import escape from swh.model.hashutil import hash_to_bytes from swh.model.model import ObjectType as ModelObjectType from swh.model.model import Release, Snapshot, SnapshotBranch, TargetType from swh.model.swhids import ObjectType from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.browse.utils import ( get_mimetype_and_encoding_for_content, prepare_content_for_display, re_encode_content, ) from swh.web.common.exc import NotFoundExc from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import ( format_utc_iso_date, gen_path_info, parse_iso8601_date_to_utc, reverse, ) from swh.web.tests.data import get_content from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.utils import check_html_get_response, check_http_get_response def test_content_view_text(client, archive_data, content_text): sha1_git = content_text["sha1_git"] url = reverse( "browse-content", url_args={"query_string": content_text["sha1"]}, query_params={"path": content_text["path"]}, ) url_raw = reverse( "browse-content-raw", url_args={"query_string": content_text["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) content_display = _process_content_for_display(archive_data, content_text) mimetype = content_display["mimetype"] if mimetype.startswith("text/"): assert_contains(resp, '<code class="%s">' % content_display["language"]) assert_contains(resp, escape(content_display["content_data"])) assert_contains(resp, url_raw) swh_cnt_id = gen_swhid(ObjectType.CONTENT, sha1_git) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) assert_not_contains(resp, "swh-metadata-popover") def test_content_view_no_highlight( client, archive_data, content_application_no_highlight, content_text_no_highlight ): for content_ in (content_application_no_highlight, content_text_no_highlight): content = content_ sha1_git = content["sha1_git"] url = reverse("browse-content", url_args={"query_string": content["sha1"]}) url_raw = reverse( "browse-content-raw", url_args={"query_string": content["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) content_display = _process_content_for_display(archive_data, content) if content["encoding"] != "binary": assert_contains(resp, '<code class="plaintext">') assert_contains(resp, escape(content_display["content_data"])) assert_contains(resp, url_raw) swh_cnt_id = gen_swhid(ObjectType.CONTENT, sha1_git) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) 
assert_contains(resp, swh_cnt_id_url) def test_content_view_no_utf8_text(client, archive_data, content_text_non_utf8): sha1_git = content_text_non_utf8["sha1_git"] url = reverse( "browse-content", url_args={"query_string": content_text_non_utf8["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) content_display = _process_content_for_display(archive_data, content_text_non_utf8) swh_cnt_id = gen_swhid(ObjectType.CONTENT, sha1_git) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id_url) assert_contains(resp, escape(content_display["content_data"])) def test_content_view_image(client, archive_data, content_image_type): url = reverse( "browse-content", url_args={"query_string": content_image_type["sha1"]} ) url_raw = reverse( "browse-content-raw", url_args={"query_string": content_image_type["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) content_display = _process_content_for_display(archive_data, content_image_type) mimetype = content_display["mimetype"] content_data = content_display["content_data"] assert_contains(resp, '<img src="data:%s;base64,%s"/>' % (mimetype, content_data)) assert_contains(resp, url_raw) def test_content_view_image_no_rendering( client, archive_data, content_unsupported_image_type_rendering ): url = reverse( "browse-content", url_args={"query_string": content_unsupported_image_type_rendering["sha1"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) mimetype = content_unsupported_image_type_rendering["mimetype"] encoding = content_unsupported_image_type_rendering["encoding"] assert_contains( resp, ( f"Content with mime type {mimetype} and encoding {encoding} " "cannot be displayed." 
), ) def test_content_view_text_with_path(client, archive_data, content_text): path = content_text["path"] url = reverse( "browse-content", url_args={"query_string": content_text["sha1"]}, query_params={"path": path}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) assert_contains(resp, '<nav class="bread-crumbs') content_display = _process_content_for_display(archive_data, content_text) mimetype = content_display["mimetype"] if mimetype.startswith("text/"): hljs_language = content_text["hljs_language"] assert_contains(resp, '<code class="%s">' % hljs_language) assert_contains(resp, escape(content_display["content_data"])) split_path = path.split("/") root_dir_sha1 = split_path[0] filename = split_path[-1] path = path.replace(root_dir_sha1 + "/", "").replace(filename, "") swhid_context = { "anchor": gen_swhid(ObjectType.DIRECTORY, root_dir_sha1), "path": f"/{path}{filename}", } swh_cnt_id = gen_swhid( ObjectType.CONTENT, content_text["sha1_git"], metadata=swhid_context ) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) path_info = gen_path_info(path) root_dir_url = reverse("browse-directory", url_args={"sha1_git": root_dir_sha1}) assert_contains(resp, '<li class="swh-path">', count=len(path_info) + 1) assert_contains( resp, '<a href="' + root_dir_url + '">' + root_dir_sha1[:7] + "</a>" ) for p in path_info: dir_url = reverse( "browse-directory", url_args={"sha1_git": root_dir_sha1}, query_params={"path": p["path"]}, ) assert_contains(resp, '<a href="' + dir_url + '">' + p["name"] + "</a>") assert_contains(resp, "<li>" + filename + "</li>") url_raw = reverse( "browse-content-raw", url_args={"query_string": content_text["sha1"]}, query_params={"filename": filename}, ) assert_contains(resp, url_raw) url = reverse( "browse-content", url_args={"query_string": content_text["sha1"]}, query_params={"path": filename}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) assert_not_contains(resp, '<nav class="bread-crumbs') invalid_path = "%s/foo/bar/baz" % root_dir_sha1 url = reverse( "browse-content", url_args={"query_string": content_text["sha1"]}, query_params={"path": invalid_path}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) def test_content_raw_text(client, archive_data, content_text): url = reverse("browse-content-raw", url_args={"query_string": content_text["sha1"]}) resp = check_http_get_response( client, url, status_code=200, content_type="text/plain" ) content_data = archive_data.content_get_data(content_text["sha1"])["data"] assert resp["Content-Type"] == "text/plain" assert resp["Content-disposition"] == ( "filename=%s_%s" % ("sha1", content_text["sha1"]) ) assert resp.content == content_data filename = content_text["path"].split("/")[-1] url = reverse( "browse-content-raw", url_args={"query_string": content_text["sha1"]}, query_params={"filename": filename}, ) resp = check_http_get_response( client, url, status_code=200, content_type="text/plain" ) assert resp["Content-Type"] == "text/plain" assert resp["Content-disposition"] == "filename=%s" % filename assert resp.content == content_data def test_content_raw_no_utf8_text(client, content_text_non_utf8): url = reverse( "browse-content-raw", url_args={"query_string": content_text_non_utf8["sha1"]} ) resp = check_http_get_response( client, url, status_code=200, content_type="text/plain" ) _, 
encoding = get_mimetype_and_encoding_for_content(resp.content) assert encoding == content_text_non_utf8["encoding"] def test_content_raw_bin(client, archive_data, content_image_type): url = reverse( "browse-content-raw", url_args={"query_string": content_image_type["sha1"]} ) resp = check_http_get_response( client, url, status_code=200, content_type="application/octet-stream" ) filename = content_image_type["path"].split("/")[-1] content_data = archive_data.content_get_data(content_image_type["sha1"])["data"] assert resp["Content-Type"] == "application/octet-stream" assert resp["Content-disposition"] == "attachment; filename=%s_%s" % ( "sha1", content_image_type["sha1"], ) assert resp.content == content_data url = reverse( "browse-content-raw", url_args={"query_string": content_image_type["sha1"]}, query_params={"filename": filename}, ) resp = check_http_get_response( client, url, status_code=200, content_type="application/octet-stream" ) assert resp["Content-Type"] == "application/octet-stream" assert resp["Content-disposition"] == "attachment; filename=%s" % filename assert resp.content == content_data @pytest.mark.django_db @pytest.mark.parametrize("staff_user_logged_in", [False, True]) def test_content_request_errors( client, staff_user, invalid_sha1, unknown_content, staff_user_logged_in ): if staff_user_logged_in: client.force_login(staff_user) url = reverse("browse-content", url_args={"query_string": invalid_sha1}) check_html_get_response(client, url, status_code=400, template_used="error.html") url = reverse("browse-content", url_args={"query_string": unknown_content["sha1"]}) check_html_get_response( client, url, status_code=404, template_used="browse/content.html" ) def test_content_bytes_missing(client, archive_data, mocker, content): mock_archive = mocker.patch("swh.web.browse.utils.archive") content_data = archive_data.content_get(content["sha1"]) mock_archive.lookup_content.return_value = content_data mock_archive.lookup_content_filetype.side_effect = Exception() mock_archive.lookup_content_raw.side_effect = NotFoundExc( "Content bytes not available!" 
) url = reverse("browse-content", url_args={"query_string": content["sha1"]}) check_html_get_response( client, url, status_code=404, template_used="browse/content.html" ) def test_content_too_large(client, mocker): mock_request_content = mocker.patch("swh.web.browse.views.content.request_content") stub_content_too_large_data = { "checksums": { "sha1": "8624bcdae55baeef00cd11d5dfcfa60f68710a02", "sha1_git": "94a9ed024d3859793618152ea559a168bbcbb5e2", "sha256": ( "8ceb4b9ee5adedde47b31e975c1d90c73ad27b6b16" "5a1dcd80c7c545eb65b903" ), "blake2s256": ( "38702b7168c7785bfe748b51b45d9856070ba90" "f9dc6d90f2ea75d4356411ffe" ), }, "length": 30000000, "raw_data": None, "mimetype": "text/plain", "encoding": "us-ascii", "language": "not detected", "licenses": "GPL", "error_code": 200, "error_message": "", "error_description": "", } content_sha1 = stub_content_too_large_data["checksums"]["sha1"] mock_request_content.return_value = stub_content_too_large_data url = reverse("browse-content", url_args={"query_string": content_sha1}) url_raw = reverse("browse-content-raw", url_args={"query_string": content_sha1}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) assert_contains(resp, "Content is too large to be displayed") assert_contains(resp, url_raw) def test_content_uppercase(client, content): url = reverse( "browse-content-uppercase-checksum", url_args={"query_string": content["sha1"].upper()}, ) resp = check_html_get_response(client, url, status_code=302) redirect_url = reverse("browse-content", url_args={"query_string": content["sha1"]}) assert resp["location"] == redirect_url def test_content_utf8_detected_as_binary_display( client, archive_data, content_utf8_detected_as_binary ): url = reverse( "browse-content", url_args={"query_string": content_utf8_detected_as_binary["sha1"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) content_display = _process_content_for_display( archive_data, content_utf8_detected_as_binary ) assert_contains(resp, escape(content_display["content_data"])) def test_content_origin_snapshot_branch_browse( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) visit = random.choice(visits) snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(visit["snapshot"]) branches, releases, _ = process_snapshot_branches(snapshot) branch_info = random.choice(branches) directory = archive_data.revision_get(branch_info["revision"])["directory"] directory_content = archive_data.directory_ls(directory) directory_file = random.choice( [e for e in directory_content if e["type"] == "file"] ) url = reverse( "browse-content", url_args={"query_string": directory_file["checksums"]["sha1"]}, query_params={ "origin_url": origin_with_multiple_visits["url"], "snapshot": snapshot["id"], "branch": branch_info["name"], "path": directory_file["name"], }, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) _check_origin_snapshot_related_html( resp, origin_with_multiple_visits, snapshot, snapshot_sizes, branches, releases ) assert_contains(resp, directory_file["name"]) assert_contains(resp, f"Branch: <strong>{branch_info['name']}</strong>") cnt_swhid = gen_swhid( ObjectType.CONTENT, directory_file["checksums"]["sha1_git"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, 
snapshot["id"]), "anchor": gen_swhid(ObjectType.REVISION, branch_info["revision"]), "path": f"/{directory_file['name']}", }, ) assert_contains(resp, cnt_swhid) dir_swhid = gen_swhid( ObjectType.DIRECTORY, directory, metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.REVISION, branch_info["revision"]), }, ) assert_contains(resp, dir_swhid) rev_swhid = gen_swhid( ObjectType.REVISION, branch_info["revision"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), }, ) assert_contains(resp, rev_swhid) snp_swhid = gen_swhid( - ObjectType.SNAPSHOT, snapshot["id"], metadata={"origin": origin_url,}, + ObjectType.SNAPSHOT, + snapshot["id"], + metadata={ + "origin": origin_url, + }, ) assert_contains(resp, snp_swhid) def test_content_origin_snapshot_release_browse( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) visit = random.choice(visits) snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(visit["snapshot"]) branches, releases, _ = process_snapshot_branches(snapshot) release_info = random.choice(releases) directory_content = archive_data.directory_ls(release_info["directory"]) directory_file = random.choice( [e for e in directory_content if e["type"] == "file"] ) url = reverse( "browse-content", url_args={"query_string": directory_file["checksums"]["sha1"]}, query_params={ "origin_url": origin_url, "snapshot": snapshot["id"], "release": release_info["name"], "path": directory_file["name"], }, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) _check_origin_snapshot_related_html( resp, origin_with_multiple_visits, snapshot, snapshot_sizes, branches, releases ) assert_contains(resp, directory_file["name"]) assert_contains(resp, f"Release: <strong>{release_info['name']}</strong>") cnt_swhid = gen_swhid( ObjectType.CONTENT, directory_file["checksums"]["sha1_git"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.RELEASE, release_info["id"]), "path": f"/{directory_file['name']}", }, ) assert_contains(resp, cnt_swhid) dir_swhid = gen_swhid( ObjectType.DIRECTORY, release_info["directory"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.RELEASE, release_info["id"]), }, ) assert_contains(resp, dir_swhid) rev_swhid = gen_swhid( ObjectType.REVISION, release_info["target"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), }, ) assert_contains(resp, rev_swhid) rel_swhid = gen_swhid( ObjectType.RELEASE, release_info["id"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), }, ) assert_contains(resp, rel_swhid) snp_swhid = gen_swhid( - ObjectType.SNAPSHOT, snapshot["id"], metadata={"origin": origin_url,}, + ObjectType.SNAPSHOT, + snapshot["id"], + metadata={ + "origin": origin_url, + }, ) assert_contains(resp, snp_swhid) def _check_origin_snapshot_related_html( resp, origin, snapshot, snapshot_sizes, branches, releases ): browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin["url"]} ) assert_contains(resp, f'href="{browse_origin_url}"') origin_branches_url = reverse( "browse-origin-branches", query_params={"origin_url": origin["url"], "snapshot": 
snapshot["id"]}, ) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse( "browse-origin-releases", query_params={"origin_url": origin["url"], "snapshot": snapshot["id"]}, ) assert_contains(resp, f'href="{escape(origin_releases_url)}"') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<li class="swh-branch">', count=len(branches)) assert_contains(resp, '<li class="swh-release">', count=len(releases)) def _process_content_for_display(archive_data, content): content_data = archive_data.content_get_data(content["sha1"]) mime_type, encoding = get_mimetype_and_encoding_for_content(content_data["data"]) mime_type, encoding, content_data = re_encode_content( mime_type, encoding, content_data["data"] ) content_display = prepare_content_for_display( content_data, mime_type, content["path"] ) assert type(content_display["content_data"]) == str return content_display def test_content_dispaly_empty_query_string_missing_path(client): - url = reverse("browse-content", query_params={"origin_url": "http://example.com"},) + url = reverse( + "browse-content", + query_params={"origin_url": "http://example.com"}, + ) resp = check_html_get_response( client, url, status_code=400, template_used="error.html" ) assert_contains(resp, "The path query parameter must be provided.", status_code=400) def test_content_dispaly_empty_query_string_and_snapshot_origin(client): - url = reverse("browse-content", query_params={"path": "test.txt"},) - resp = check_html_get_response(client, url, status_code=400,) + url = reverse( + "browse-content", + query_params={"path": "test.txt"}, + ) + resp = check_html_get_response( + client, + url, + status_code=400, + ) assert_contains( resp, "The origin_url or snapshot query parameters must be provided.", status_code=400, ) def test_content_dispaly_empty_query_string_with_origin( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] snapshot = archive_data.snapshot_get_latest(origin_url) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) dir_content = archive_data.directory_ls(head_rev["directory"]) dir_files = [e for e in dir_content if e["type"] == "file"] dir_file = random.choice(dir_files) url = reverse( "browse-content", - query_params={"origin_url": origin_url, "path": dir_file["name"],}, + query_params={ + "origin_url": origin_url, + "path": dir_file["name"], + }, ) - resp = check_html_get_response(client, url, status_code=302,) + resp = check_html_get_response( + client, + url, + status_code=302, + ) redict_url = reverse( "browse-content", url_args={"query_string": f"sha1_git:{dir_file['checksums']['sha1_git']}"}, - query_params={"origin_url": origin_url, "path": dir_file["name"],}, + query_params={ + "origin_url": origin_url, + "path": dir_file["name"], + }, ) assert resp.url == redict_url def test_content_dispaly_empty_query_string_with_snapshot( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] snapshot = archive_data.snapshot_get_latest(origin_url) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) dir_content = archive_data.directory_ls(head_rev["directory"]) dir_files = [e for e in dir_content if e["type"] == "file"] dir_file = random.choice(dir_files) url = reverse( "browse-content", - query_params={"snapshot": snapshot["id"], 
"path": dir_file["name"],}, + query_params={ + "snapshot": snapshot["id"], + "path": dir_file["name"], + }, ) - resp = check_html_get_response(client, url, status_code=302,) + resp = check_html_get_response( + client, + url, + status_code=302, + ) redict_url = reverse( "browse-content", url_args={"query_string": f"sha1_git:{dir_file['checksums']['sha1_git']}"}, - query_params={"snapshot": snapshot["id"], "path": dir_file["name"],}, + query_params={ + "snapshot": snapshot["id"], + "path": dir_file["name"], + }, ) assert resp.url == redict_url def test_browse_origin_content_no_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [] mock_archive = mocker.patch("swh.web.common.origin_visits.archive") mock_archive.lookup_origin_visit_latest.return_value = None url = reverse( - "browse-content", query_params={"origin_url": origin["url"], "path": "foo"}, + "browse-content", + query_params={"origin_url": origin["url"], "path": "foo"}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains(resp, "No valid visit", status_code=404) assert not mock_get_origin_visits.called def test_browse_origin_content_unknown_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [{"visit": 1}] url = reverse( "browse-content", query_params={"origin_url": origin["url"], "path": "foo", "visit_id": 2}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search("Resource not found", resp.content.decode("utf-8")) def test_browse_origin_content_not_found(client, origin): url = reverse( "browse-content", query_params={"origin_url": origin["url"], "path": "/invalid/file/path"}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search("Resource not found", resp.content.decode("utf-8")) def test_browse_content_invalid_origin(client): url = reverse( "browse-content", query_params={ "origin_url": "http://invalid-origin", "path": "/invalid/file/path", }, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search("Resource not found", resp.content.decode("utf-8")) def test_origin_content_view( client, archive_data, swh_scheduler, origin_with_multiple_visits ): origin_visits = archive_data.origin_visit_get(origin_with_multiple_visits["url"]) def _get_archive_data(visit_idx): snapshot = archive_data.snapshot_get(origin_visits[visit_idx]["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) dir_content = archive_data.directory_ls(head_rev["directory"]) dir_files = [e for e in dir_content if e["type"] == "file"] dir_file = random.choice(dir_files) branches, releases, _ = process_snapshot_branches(snapshot) return { "branches": branches, "releases": releases, "root_dir_sha1": head_rev["directory"], "content": get_content(dir_file["checksums"]["sha1"]), "visit": origin_visits[visit_idx], "snapshot_sizes": archive_data.snapshot_count_branches(snapshot["id"]), } tdata = _get_archive_data(-1) _origin_content_view_test_helper( client, archive_data, origin_with_multiple_visits, origin_visits[-1], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], ) _origin_content_view_test_helper( client, archive_data, 
origin_with_multiple_visits, origin_visits[-1], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], timestamp=tdata["visit"]["date"], ) _origin_content_view_test_helper( client, archive_data, origin_with_multiple_visits, origin_visits[-1], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], snapshot_id=tdata["visit"]["snapshot"], ) tdata = _get_archive_data(0) _origin_content_view_test_helper( client, archive_data, origin_with_multiple_visits, origin_visits[0], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], visit_id=tdata["visit"]["visit"], ) _origin_content_view_test_helper( client, archive_data, origin_with_multiple_visits, origin_visits[0], tdata["snapshot_sizes"], tdata["branches"], tdata["releases"], tdata["root_dir_sha1"], tdata["content"], snapshot_id=tdata["visit"]["snapshot"], ) def _origin_content_view_test_helper( client, archive_data, origin_info, origin_visit, snapshot_sizes, origin_branches, origin_releases, root_dir_sha1, content, visit_id=None, timestamp=None, snapshot_id=None, ): content_path = "/".join(content["path"].split("/")[1:]) if not visit_id and not snapshot_id: visit_id = origin_visit["visit"] query_params = {"origin_url": origin_info["url"], "path": content_path} if timestamp: query_params["timestamp"] = timestamp if visit_id: query_params["visit_id"] = visit_id elif snapshot_id: query_params["snapshot"] = snapshot_id url = reverse( "browse-content", url_args={"query_string": f"sha1_git:{content['sha1_git']}"}, query_params=query_params, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) assert type(content["data"]) == str assert_contains(resp, '<code class="%s">' % content["hljs_language"]) assert_contains(resp, escape(content["data"])) split_path = content_path.split("/") filename = split_path[-1] path = content_path.replace(filename, "")[:-1] path_info = gen_path_info(path) del query_params["path"] if timestamp: query_params["timestamp"] = format_utc_iso_date( parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ" ) root_dir_url = reverse( "browse-directory", url_args={"sha1_git": root_dir_sha1}, query_params=query_params, ) assert_contains(resp, '<li class="swh-path">', count=len(path_info) + 1) assert_contains(resp, '<a href="%s">%s</a>' % (root_dir_url, root_dir_sha1[:7])) for p in path_info: query_params["path"] = p["path"] dir_url = reverse("browse-origin-directory", query_params=query_params) assert_contains(resp, '<a href="%s">%s</a>' % (dir_url, p["name"])) assert_contains(resp, "<li>%s</li>" % filename) query_string = "sha1_git:" + content["sha1_git"] url_raw = reverse( "browse-content-raw", url_args={"query_string": query_string}, query_params={"filename": filename}, ) assert_contains(resp, url_raw) if "path" in query_params: del query_params["path"] origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) assert_contains(resp, f'href="{escape(origin_releases_url)}">') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<li class="swh-branch">', count=len(origin_branches)) query_params["path"] = content_path for branch in origin_branches: 
root_dir_branch_url = reverse( "browse-origin-content", query_params={"branch": branch["name"], **query_params}, ) assert_contains(resp, '<a href="%s">' % root_dir_branch_url) assert_contains(resp, '<li class="swh-release">', count=len(origin_releases)) query_params["branch"] = None for release in origin_releases: root_dir_release_url = reverse( "browse-origin-content", query_params={"release": release["name"], **query_params}, ) assert_contains(resp, '<a href="%s">' % root_dir_release_url) url = reverse( "browse-content", url_args={"query_string": query_string}, query_params=query_params, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/content.html" ) snapshot = archive_data.snapshot_get(origin_visit["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) swhid_context = { "origin": origin_info["url"], "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.REVISION, head_rev_id), "path": f"/{content_path}", } swh_cnt_id = gen_swhid( ObjectType.CONTENT, content["sha1_git"], metadata=swhid_context ) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) assert_contains(resp, "swh-take-new-snapshot") _check_origin_link(resp, origin_info["url"]) assert_not_contains(resp, "swh-metadata-popover") def _check_origin_link(resp, origin_url): browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin_url} ) assert_contains(resp, f'href="{browse_origin_url}"') @pytest.mark.django_db @pytest.mark.parametrize("staff_user_logged_in", [False, True]) def test_browse_content_snapshot_context_release_directory_target( client, staff_user, archive_data, directory_with_files, staff_user_logged_in ): if staff_user_logged_in: client.force_login(staff_user) release_name = "v1.0.0" release = Release( name=release_name.encode(), message=f"release {release_name}".encode(), target=hash_to_bytes(directory_with_files), target_type=ModelObjectType.DIRECTORY, synthetic=True, ) archive_data.release_add([release]) snapshot = Snapshot( branches={ release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot]) dir_content = archive_data.directory_ls(directory_with_files) file_entry = random.choice( [entry for entry in dir_content if entry["type"] == "file"] ) sha1_git = file_entry["checksums"]["sha1_git"] browse_url = reverse( "browse-content", url_args={"query_string": f"sha1_git:{sha1_git}"}, query_params={ "path": file_entry["name"], "release": release_name, "snapshot": snapshot.id.hex(), }, ) check_html_get_response( client, browse_url, status_code=200, template_used="browse/content.html" ) diff --git a/swh/web/tests/browse/views/test_directory.py b/swh/web/tests/browse/views/test_directory.py index e990b3ee..e1e4dd82 100644 --- a/swh/web/tests/browse/views/test_directory.py +++ b/swh/web/tests/browse/views/test_directory.py @@ -1,549 +1,565 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random from hypothesis import given import pytest from django.utils.html import escape from swh.model.from_disk import DentryPerms from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, 
Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import Directory, DirectoryEntry from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import ObjectType from swh.storage.utils import now from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import gen_path_info, reverse from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.strategies import new_person, new_swh_date from swh.web.tests.utils import check_html_get_response def test_root_directory_view(client, archive_data, directory): _directory_view_checks(client, directory, archive_data.directory_ls(directory)) def test_sub_directory_view(client, archive_data, directory_with_subdirs): dir_content = archive_data.directory_ls(directory_with_subdirs) subdir = random.choice([e for e in dir_content if e["type"] == "dir"]) subdir_content = archive_data.directory_ls(subdir["target"]) _directory_view_checks( client, directory_with_subdirs, subdir_content, subdir["name"] ) @given(new_person(), new_swh_date()) def test_sub_directory_view_origin_context( client, archive_data, empty_directory, person, date ): origin_url = "test_sub_directory_view_origin_context" subdir = Directory( entries=( DirectoryEntry( name=b"foo", type="dir", target=hash_to_bytes(empty_directory), perms=DentryPerms.directory, ), DirectoryEntry( name=b"bar", type="dir", target=hash_to_bytes(empty_directory), perms=DentryPerms.directory, ), ) ) parentdir = Directory( entries=( DirectoryEntry( - name=b"baz", type="dir", target=subdir.id, perms=DentryPerms.directory, + name=b"baz", + type="dir", + target=subdir.id, + perms=DentryPerms.directory, ), ) ) archive_data.directory_add([subdir, parentdir]) revision = Revision( directory=parentdir.id, author=person, committer=person, message=b"commit message", date=TimestampWithTimezone.from_datetime(date), committer_date=TimestampWithTimezone.from_datetime(date), synthetic=False, type=RevisionType.GIT, ) archive_data.revision_add([revision]) snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( - target="refs/head/master".encode(), target_type=TargetType.ALIAS, + target="refs/head/master".encode(), + target_type=TargetType.ALIAS, ), b"refs/head/master": SnapshotBranch( - target=revision.id, target_type=TargetType.REVISION, + target=revision.id, + target_type=TargetType.REVISION, ), } ) archive_data.snapshot_add([snapshot]) archive_data.origin_add([Origin(url=origin_url)]) date = now() visit = OriginVisit(origin=origin_url, date=date, type="git") visit = archive_data.origin_visit_add([visit])[0] visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=date, status="full", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) dir_content = archive_data.directory_ls(hash_to_hex(parentdir.id)) subdir = dir_content[0] subdir_content = archive_data.directory_ls(subdir["target"]) _directory_view_checks( client, hash_to_hex(parentdir.id), subdir_content, subdir["name"], origin_url, hash_to_hex(snapshot.id), hash_to_hex(revision.id), ) def test_directory_request_errors(client, invalid_sha1, unknown_directory): dir_url = reverse("browse-directory", url_args={"sha1_git": invalid_sha1}) check_html_get_response( client, dir_url, status_code=400, template_used="error.html" ) dir_url = reverse("browse-directory", url_args={"sha1_git": unknown_directory}) check_html_get_response( 
client, dir_url, status_code=404, template_used="error.html" ) def test_directory_with_invalid_path(client, directory): path = "foo/bar" dir_url = reverse( "browse-directory", url_args={"sha1_git": directory}, query_params={"path": path}, ) resp = check_html_get_response( client, dir_url, status_code=404, template_used="browse/directory.html" ) error_message = ( f"Directory entry with path {path} from root directory {directory} not found" ) assert_contains(resp, error_message, status_code=404) def test_directory_uppercase(client, directory): url = reverse( "browse-directory-uppercase-checksum", url_args={"sha1_git": directory.upper()} ) resp = check_html_get_response(client, url, status_code=302) redirect_url = reverse("browse-directory", url_args={"sha1_git": directory}) assert resp["location"] == redirect_url def test_permalink_box_context(client, tests_data, directory): origin_url = random.choice(tests_data["origins"])["url"] url = reverse( "browse-directory", url_args={"sha1_git": directory}, query_params={"origin_url": origin_url}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, 'id="swhid-context-option-directory"') def test_directory_origin_snapshot_branch_browse( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) visit = random.choice(visits) snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(visit["snapshot"]) branches, releases, _ = process_snapshot_branches(snapshot) branch_info = next( branch for branch in branches if branch["name"] == "refs/heads/master" ) directory = archive_data.revision_get(branch_info["revision"])["directory"] directory_content = archive_data.directory_ls(directory) directory_subdir = random.choice( [e for e in directory_content if e["type"] == "dir"] ) url = reverse( "browse-directory", url_args={"sha1_git": directory}, query_params={ "origin_url": origin_url, "snapshot": snapshot["id"], "branch": branch_info["name"], "path": directory_subdir["name"], }, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) _check_origin_snapshot_related_html( resp, origin_with_multiple_visits, snapshot, snapshot_sizes, branches, releases ) assert_contains(resp, directory_subdir["name"]) assert_contains(resp, f"Branch: <strong>{branch_info['name']}</strong>") dir_swhid = gen_swhid( ObjectType.DIRECTORY, directory_subdir["target"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.REVISION, branch_info["revision"]), "path": "/", }, ) assert_contains(resp, dir_swhid) rev_swhid = gen_swhid( ObjectType.REVISION, branch_info["revision"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), }, ) assert_contains(resp, rev_swhid) snp_swhid = gen_swhid( - ObjectType.SNAPSHOT, snapshot["id"], metadata={"origin": origin_url,}, + ObjectType.SNAPSHOT, + snapshot["id"], + metadata={ + "origin": origin_url, + }, ) assert_contains(resp, snp_swhid) def test_drectory_origin_snapshot_release_browse( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) visit = random.choice(visits) snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = 
archive_data.snapshot_count_branches(visit["snapshot"]) branches, releases, _ = process_snapshot_branches(snapshot) release_info = random.choice(releases) directory = release_info["directory"] directory_content = archive_data.directory_ls(directory) directory_subdir = random.choice( [e for e in directory_content if e["type"] == "dir"] ) url = reverse( "browse-directory", url_args={"sha1_git": directory}, query_params={ "origin_url": origin_url, "snapshot": snapshot["id"], "release": release_info["name"], "path": directory_subdir["name"], }, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) _check_origin_snapshot_related_html( resp, origin_with_multiple_visits, snapshot, snapshot_sizes, branches, releases ) assert_contains(resp, directory_subdir["name"]) assert_contains(resp, f"Release: <strong>{release_info['name']}</strong>") dir_swhid = gen_swhid( ObjectType.DIRECTORY, directory_subdir["target"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.RELEASE, release_info["id"]), "path": "/", }, ) assert_contains(resp, dir_swhid) rev_swhid = gen_swhid( ObjectType.REVISION, release_info["target"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), }, ) assert_contains(resp, rev_swhid) rel_swhid = gen_swhid( ObjectType.RELEASE, release_info["id"], metadata={ "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), }, ) assert_contains(resp, rel_swhid) snp_swhid = gen_swhid( - ObjectType.SNAPSHOT, snapshot["id"], metadata={"origin": origin_url,}, + ObjectType.SNAPSHOT, + snapshot["id"], + metadata={ + "origin": origin_url, + }, ) assert_contains(resp, snp_swhid) def test_directory_origin_snapshot_revision_browse( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) visit = random.choice(visits) snapshot = archive_data.snapshot_get(visit["snapshot"]) branches, releases, _ = process_snapshot_branches(snapshot) branch_info = next( branch for branch in branches if branch["name"] == "refs/heads/master" ) directory = archive_data.revision_get(branch_info["revision"])["directory"] directory_content = archive_data.directory_ls(directory) directory_subdir = random.choice( [e for e in directory_content if e["type"] == "dir"] ) url = reverse( "browse-directory", url_args={"sha1_git": directory}, query_params={ "origin_url": origin_url, "snapshot": snapshot["id"], "revision": branch_info["revision"], "path": directory_subdir["name"], }, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, f"Revision: <strong>{branch_info['revision']}</strong>") def _check_origin_snapshot_related_html( resp, origin, snapshot, snapshot_sizes, branches, releases ): browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin["url"]} ) assert_contains(resp, f'href="{browse_origin_url}"') origin_branches_url = reverse( "browse-origin-branches", query_params={"origin_url": origin["url"], "snapshot": snapshot["id"]}, ) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse( "browse-origin-releases", query_params={"origin_url": origin["url"], "snapshot": snapshot["id"]}, ) assert_contains(resp, f'href="{escape(origin_releases_url)}"') assert_contains(resp, 
f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<li class="swh-branch">', count=len(branches)) assert_contains(resp, '<li class="swh-release">', count=len(releases)) def _directory_view_checks( client, root_directory_sha1, directory_entries, path=None, origin_url=None, snapshot_id=None, revision_id=None, ): dirs = [e for e in directory_entries if e["type"] in ("dir", "rev")] files = [e for e in directory_entries if e["type"] == "file"] url_args = {"sha1_git": root_directory_sha1} query_params = {"origin_url": origin_url, "snapshot": snapshot_id} url = reverse( "browse-directory", url_args=url_args, query_params={**query_params, "path": path}, ) root_dir_url = reverse( - "browse-directory", url_args=url_args, query_params=query_params, + "browse-directory", + url_args=url_args, + query_params=query_params, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains( - resp, '<a href="' + root_dir_url + '">' + root_directory_sha1[:7] + "</a>", + resp, + '<a href="' + root_dir_url + '">' + root_directory_sha1[:7] + "</a>", ) assert_contains(resp, '<td class="swh-directory">', count=len(dirs)) assert_contains(resp, '<td class="swh-content">', count=len(files)) for d in dirs: if d["type"] == "rev": dir_url = reverse("browse-revision", url_args={"sha1_git": d["target"]}) else: dir_path = d["name"] if path: dir_path = "%s/%s" % (path, d["name"]) dir_url = reverse( "browse-directory", url_args={"sha1_git": root_directory_sha1}, query_params={**query_params, "path": dir_path}, ) assert_contains(resp, dir_url) for f in files: file_path = "%s/%s" % (root_directory_sha1, f["name"]) if path: file_path = "%s/%s/%s" % (root_directory_sha1, path, f["name"]) query_string = "sha1_git:" + f["target"] file_url = reverse( "browse-content", url_args={"query_string": query_string}, query_params={**query_params, "path": file_path}, ) assert_contains(resp, file_url) path_info = gen_path_info(path) assert_contains(resp, '<li class="swh-path">', count=len(path_info) + 1) assert_contains( resp, '<a href="%s">%s</a>' % (root_dir_url, root_directory_sha1[:7]) ) for p in path_info: dir_url = reverse( "browse-directory", url_args={"sha1_git": root_directory_sha1}, query_params={**query_params, "path": p["path"]}, ) assert_contains(resp, '<a href="%s">%s</a>' % (dir_url, p["name"])) assert_contains(resp, "vault-cook-directory") swh_dir_id = gen_swhid(ObjectType.DIRECTORY, directory_entries[0]["dir_id"]) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) swhid_context = {} if origin_url: swhid_context["origin"] = origin_url if snapshot_id: swhid_context["visit"] = gen_swhid(ObjectType.SNAPSHOT, snapshot_id) if root_directory_sha1 != directory_entries[0]["dir_id"]: swhid_context["anchor"] = gen_swhid(ObjectType.DIRECTORY, root_directory_sha1) if root_directory_sha1 != directory_entries[0]["dir_id"]: swhid_context["anchor"] = gen_swhid(ObjectType.DIRECTORY, root_directory_sha1) if revision_id: swhid_context["anchor"] = gen_swhid(ObjectType.REVISION, revision_id) swhid_context["path"] = f"/{path}/" if path else None swh_dir_id = gen_swhid( ObjectType.DIRECTORY, directory_entries[0]["dir_id"], metadata=swhid_context ) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) assert_not_contains(resp, "swh-metadata-popover") @pytest.mark.django_db @pytest.mark.parametrize("staff_user_logged_in", [False, True]) def 
test_browse_directory_snapshot_context_release_directory_target( client, staff_user, archive_data, directory_with_subdirs, staff_user_logged_in ): if staff_user_logged_in: client.force_login(staff_user) release_name = "v1.0.0" release = Release( name=release_name.encode(), message=f"release {release_name}".encode(), target=hash_to_bytes(directory_with_subdirs), target_type=ModelObjectType.DIRECTORY, synthetic=True, ) archive_data.release_add([release]) snapshot = Snapshot( branches={ release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot]) dir_content = archive_data.directory_ls(directory_with_subdirs) dir_entry = random.choice( [entry for entry in dir_content if entry["type"] == "dir"] ) browse_url = reverse( "browse-directory", url_args={"sha1_git": directory_with_subdirs}, query_params={ "path": dir_entry["name"], "release": release_name, "snapshot": snapshot.id.hex(), }, ) check_html_get_response( client, browse_url, status_code=200, template_used="browse/directory.html" ) diff --git a/swh/web/tests/browse/views/test_origin.py b/swh/web/tests/browse/views/test_origin.py index a0ab9ec9..c868a1a8 100644 --- a/swh/web/tests/browse/views/test_origin.py +++ b/swh/web/tests/browse/views/test_origin.py @@ -1,888 +1,945 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random import re from hypothesis import given import pytest from django.utils.html import escape from swh.model.hashutil import hash_to_bytes from swh.model.model import ( OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import ObjectType from swh.storage.utils import now from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.common.exc import NotFoundExc from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import format_utc_iso_date, parse_iso8601_date_to_utc, reverse from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.strategies import new_origin, new_snapshot, visit_dates from swh.web.tests.utils import check_html_get_response def test_origin_visits_browse(client, archive_data, origin_with_multiple_visits): origin_url = origin_with_multiple_visits["url"] url = reverse("browse-origin-visits", query_params={"origin_url": origin_url}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/origin-visits.html" ) visits = archive_data.origin_visit_get(origin_url) for v in visits: vdate = format_utc_iso_date(v["date"], "%Y-%m-%dT%H:%M:%SZ") browse_dir_url = reverse( "browse-origin-directory", query_params={"origin_url": origin_url, "timestamp": vdate}, ) assert_contains(resp, browse_dir_url) _check_origin_link(resp, origin_url) def test_origin_root_directory_view(client, archive_data, swh_scheduler, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) root_dir_sha1 = head_rev["directory"] dir_content = archive_data.directory_ls(root_dir_sha1) branches, releases, _ = process_snapshot_branches(snapshot) 
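# ----------------------------------------------------------------------------
# Editorial aside (illustrative sketch, not part of the patch): the
# _origin_directory_view_test_helper calls below pin the very same snapshot in
# four ways: implicitly via the latest visit, by visit_id, by visit timestamp,
# and by snapshot id. A minimal standalone sketch of the query parameters each
# mode contributes, mirroring the branching done inside
# _origin_directory_view_test_helper further down; the helper name and the
# `mode` argument are hypothetical:
def _snapshot_addressing_params(origin_url, visit, mode="latest"):
    # `visit` is a dict shaped like those returned by
    # archive_data.origin_visit_get()
    params = {"origin_url": origin_url}
    if mode == "timestamp":
        params["timestamp"] = visit["date"]  # resolve the visit by date
    elif mode == "visit_id":
        params["visit_id"] = visit["visit"]  # resolve by visit number
    elif mode == "snapshot":
        params["snapshot"] = visit["snapshot"]  # pin the snapshot directly
    # mode == "latest": no extra parameter; the latest full visit is used
    return params
# ----------------------------------------------------------------------------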
_origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, visit_id=visit["visit"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, timestamp=visit["date"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, dir_content, snapshot_id=visit["snapshot"], ) def test_origin_sub_directory_view(client, archive_data, swh_scheduler, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) root_dir_sha1 = head_rev["directory"] subdirs = [ e for e in archive_data.directory_ls(root_dir_sha1) if e["type"] == "dir" ] branches, releases, _ = process_snapshot_branches(snapshot) if len(subdirs) == 0: return subdir = random.choice(subdirs) subdir_content = archive_data.directory_ls(subdir["target"]) subdir_path = subdir["name"] _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, visit_id=visit["visit"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, timestamp=visit["date"], ) _origin_directory_view_test_helper( client, archive_data, origin, visit, snapshot_sizes, branches, releases, root_dir_sha1, subdir_content, path=subdir_path, snapshot_id=visit["snapshot"], ) @given( - new_origin(), new_snapshot(min_size=4, max_size=4), visit_dates(), + new_origin(), +
new_snapshot(min_size=4, max_size=4), + visit_dates(), ) def test_origin_snapshot_null_branch( - client, archive_data, revisions_list, new_origin, new_snapshot, visit_dates, + client, + archive_data, + revisions_list, + new_origin, + new_snapshot, + visit_dates, ): revisions = revisions_list(size=4) snp_dict = new_snapshot.to_dict() archive_data.origin_add([new_origin]) for i, branch in enumerate(snp_dict["branches"].keys()): if i == 0: snp_dict["branches"][branch] = None else: snp_dict["branches"][branch] = { "target_type": "revision", "target": hash_to_bytes(revisions[i - 1]), } archive_data.snapshot_add([Snapshot.from_dict(snp_dict)]) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_dates[0], + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="partial", snapshot=snp_dict["id"], ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url} ) check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) @given( - new_origin(), new_snapshot(min_size=4, max_size=4), visit_dates(), + new_origin(), + new_snapshot(min_size=4, max_size=4), + visit_dates(), ) def test_origin_snapshot_invalid_branch( - client, archive_data, revisions_list, new_origin, new_snapshot, visit_dates, + client, + archive_data, + revisions_list, + new_origin, + new_snapshot, + visit_dates, ): revisions = revisions_list(size=4) snp_dict = new_snapshot.to_dict() archive_data.origin_add([new_origin]) for i, branch in enumerate(snp_dict["branches"].keys()): snp_dict["branches"][branch] = { "target_type": "revision", "target": hash_to_bytes(revisions[i]), } archive_data.snapshot_add([Snapshot.from_dict(snp_dict)]) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_dates[0], + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="full", snapshot=snp_dict["id"], ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url, "branch": "invalid_branch"}, ) check_html_get_response(client, url, status_code=404, template_used="error.html") @given(new_origin()) def test_browse_visits_origin_not_found(client, new_origin): url = reverse("browse-origin-visits", query_params={"origin_url": new_origin.url}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains( resp, f"Origin with url {new_origin.url} not found", status_code=404 ) def test_browse_origin_directory_no_visit(client, mocker, origin): mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = [] mock_archive = mocker.patch("swh.web.common.origin_visits.archive") mock_archive.lookup_origin_visit_latest.return_value = None url = reverse("browse-origin-directory", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains(resp, "No valid visit", status_code=404) assert not mock_get_origin_visits.called def test_browse_origin_directory_unknown_visit(client, origin): url = reverse( "browse-origin-directory", 
query_params={"origin_url": origin["url"], "visit_id": 200}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search("visit.*not found", resp.content.decode("utf-8")) def test_browse_origin_directory_not_found(client, origin): url = reverse( "browse-origin-directory", query_params={"origin_url": origin["url"], "path": "/invalid/dir/path/"}, ) resp = check_html_get_response( client, url, status_code=404, template_used="browse/directory.html" ) assert re.search("Directory.*not found", resp.content.decode("utf-8")) def _add_empty_snapshot_origin(new_origin, archive_data): snapshot = Snapshot(branches={}) archive_data.origin_add([new_origin]) archive_data.snapshot_add([snapshot]) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=now(), type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=now(), + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="full", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) @pytest.mark.django_db @pytest.mark.parametrize("object_type", ["directory"]) @given(new_origin()) def test_browse_origin_content_directory_empty_snapshot( client, staff_user, archive_data, object_type, new_origin ): _add_empty_snapshot_origin(new_origin, archive_data) # to check proper generation of raw extrinsic metadata api links client.force_login(staff_user) url = reverse( f"browse-origin-{object_type}", query_params={"origin_url": new_origin.url, "path": "baz"}, ) resp = check_html_get_response( client, url, status_code=200, template_used=f"browse/{object_type}.html" ) assert re.search("snapshot.*is empty", resp.content.decode("utf-8")) def test_browse_directory_snapshot_not_found(client, mocker, origin): mock_get_snapshot_context = mocker.patch( "swh.web.browse.snapshot_context.get_snapshot_context" ) mock_get_snapshot_context.side_effect = NotFoundExc("Snapshot not found") url = reverse("browse-origin-directory", query_params={"origin_url": origin["url"]}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains(resp, "Snapshot not found", status_code=404) assert mock_get_snapshot_context.called @given(new_origin()) def test_origin_empty_snapshot(client, archive_data, new_origin): _add_empty_snapshot_origin(new_origin, archive_data) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) resp_content = resp.content.decode("utf-8") assert re.search("snapshot.*is empty", resp_content) assert not re.search("swh-tr-link", resp_content) @given(new_origin()) def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin): snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( - target="refs/head/master".encode(), target_type=TargetType.ALIAS, + target="refs/head/master".encode(), + target_type=TargetType.ALIAS, ), b"refs/head/master": None, } ) archive_data.origin_add([new_origin]) archive_data.snapshot_add([snapshot]) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=now(), type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=now(), + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="partial", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( - 
"browse-origin-directory", query_params={"origin_url": new_origin.url}, + "browse-origin-directory", + query_params={"origin_url": new_origin.url}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) resp_content = resp.content.decode("utf-8") assert re.search("snapshot.*is empty", resp_content) assert not re.search("swh-tr-link", resp_content) def test_origin_release_browse(client, archive_data, origin_with_releases): origin_url = origin_with_releases["url"] snapshot = archive_data.snapshot_get_latest(origin_url) release = [ b for b in snapshot["branches"].values() if b["target_type"] == "release" ][-1] release_data = archive_data.release_get(release["target"]) revision_data = archive_data.revision_get(release_data["target"]) url = reverse( "browse-origin-directory", query_params={"origin_url": origin_url, "release": release_data["name"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, release_data["name"]) assert_contains(resp, release["target"]) swhid_context = { "origin": origin_url, "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.RELEASE, release_data["id"]), } swh_dir_id = gen_swhid( ObjectType.DIRECTORY, revision_data["directory"], metadata=swhid_context ) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) def test_origin_release_browse_not_found(client, origin_with_releases): invalid_release_name = "swh-foo-bar" url = reverse( "browse-origin-directory", query_params={ "origin_url": origin_with_releases["url"], "release": invalid_release_name, }, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert re.search( f"Release {invalid_release_name}.*not found", resp.content.decode("utf-8") ) @given(new_origin()) def test_origin_browse_directory_branch_with_non_resolvable_revision( - client, archive_data, unknown_revision, new_origin, + client, + archive_data, + unknown_revision, + new_origin, ): branch_name = "master" snapshot = Snapshot( branches={ branch_name.encode(): SnapshotBranch( - target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION, + target=hash_to_bytes(unknown_revision), + target_type=TargetType.REVISION, ) } ) archive_data.origin_add([new_origin]) archive_data.snapshot_add([snapshot]) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=now(), type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=now(), + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="partial", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) url = reverse( "browse-origin-directory", query_params={"origin_url": new_origin.url, "branch": branch_name}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains( resp, f"Revision {unknown_revision } could not be found in the archive." 
) # no revision card assert_not_contains(resp, "swh-tip-revision") # no Download dropdown assert_not_contains(resp, "swh-vault-download") # no History link assert_not_contains(resp, "swh-tr-link") # no SWHIDs for directory and revision assert_not_contains(resp, "swh:1:dir:") assert_not_contains(resp, "swh:1:rev:") def test_origin_views_no_url_query_parameter(client): for browse_context in ( "directory", "visits", ): url = reverse(f"browse-origin-{browse_context}") resp = check_html_get_response( client, url, status_code=400, template_used="error.html" ) assert_contains( - resp, "An origin URL must be provided as query parameter.", status_code=400, + resp, + "An origin URL must be provided as query parameter.", + status_code=400, ) @given(new_origin()) @pytest.mark.parametrize("browse_context", ["log", "branches", "releases"]) def test_origin_view_redirects(client, browse_context, new_origin): query_params = {"origin_url": new_origin.url} url = reverse(f"browse-origin-{browse_context}", query_params=query_params) resp = check_html_get_response(client, url, status_code=301) assert resp["location"] == reverse( f"browse-snapshot-{browse_context}", query_params=query_params ) @given(new_origin()) @pytest.mark.parametrize("browse_context", ["content"]) def test_origin_content_view_redirects(client, browse_context, new_origin): query_params = {"origin_url": new_origin.url, "path": "test.txt"} url = reverse(f"browse-origin-{browse_context}", query_params=query_params) resp = check_html_get_response(client, url, status_code=301) assert resp["location"] == reverse( f"browse-{browse_context}", query_params=query_params ) @given(new_origin()) @pytest.mark.parametrize("browse_context", ["log", "branches", "releases"]) def test_origin_view_legacy_redirects(client, browse_context, new_origin): # Each legacy route corresponds to two URL patterns, testing both url_args = [ {"origin_url": new_origin.url}, {"origin_url": new_origin.url, "timestamp": "2021-01-23T22:24:10Z"}, ] params = {"extra-param1": "extra-param1", "extra-param2": "extra-param2"} for each_arg in url_args: url = reverse( f"browse-origin-{browse_context}-legacy", url_args=each_arg, query_params=params, ) resp = check_html_get_response(client, url, status_code=301) assert resp["location"] == reverse( f"browse-snapshot-{browse_context}", query_params={**each_arg, **params} ) @given(new_origin()) def test_origin_content_view_legacy_redirects(client, new_origin): url_args = [ {"origin_url": new_origin.url}, { "origin_url": new_origin.url, "path": "test.txt", "timestamp": "2021-01-23T22:24:10Z", }, {"origin_url": new_origin.url, "path": "test.txt"}, ] params = {"extra-param1": "extra-param1", "extra-param2": "extra-param2"} for each_arg in url_args: url = reverse( - "browse-origin-content-legacy", url_args=each_arg, query_params=params, + "browse-origin-content-legacy", + url_args=each_arg, + query_params=params, ) resp = check_html_get_response(client, url, status_code=301) assert resp["location"] == reverse( "browse-content", query_params={**each_arg, **params} ) def _origin_directory_view_test_helper( client, archive_data, origin_info, origin_visit, snapshot_sizes, origin_branches, origin_releases, root_directory_sha1, directory_entries, visit_id=None, timestamp=None, snapshot_id=None, path=None, ): dirs = [e for e in directory_entries if e["type"] in ("dir", "rev")] files = [e for e in directory_entries if e["type"] == "file"] if not visit_id and not snapshot_id: visit_id = origin_visit["visit"] query_params = {"origin_url": 
origin_info["url"]} if timestamp: query_params["timestamp"] = timestamp elif visit_id: query_params["visit_id"] = visit_id else: query_params["snapshot"] = snapshot_id if path: query_params["path"] = path url = reverse("browse-origin-directory", query_params=query_params) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, '<td class="swh-directory">', count=len(dirs)) assert_contains(resp, '<td class="swh-content">', count=len(files)) if timestamp: query_params["timestamp"] = format_utc_iso_date( parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ" ) for d in dirs: if d["type"] == "rev": dir_url = reverse("browse-revision", url_args={"sha1_git": d["target"]}) else: dir_path = d["name"] if path: dir_path = "%s/%s" % (path, d["name"]) query_params["path"] = dir_path - dir_url = reverse("browse-origin-directory", query_params=query_params,) + dir_url = reverse( + "browse-origin-directory", + query_params=query_params, + ) assert_contains(resp, dir_url) for f in files: file_path = f["name"] if path: file_path = "%s/%s" % (path, f["name"]) query_params["path"] = file_path file_url = reverse("browse-origin-content", query_params=query_params) assert_contains(resp, file_url) if "path" in query_params: del query_params["path"] root_dir_branch_url = reverse("browse-origin-directory", query_params=query_params) nb_bc_paths = 1 if path: nb_bc_paths = len(path.split("/")) + 1 assert_contains(resp, '<li class="swh-path">', count=nb_bc_paths) assert_contains( resp, '<a href="%s">%s</a>' % (root_dir_branch_url, root_directory_sha1[:7]) ) origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) nb_releases = len(origin_releases) if nb_releases > 0: assert_contains(resp, f'href="{escape(origin_releases_url)}"') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") if path: query_params["path"] = path assert_contains(resp, '<li class="swh-branch">', count=len(origin_branches)) for branch in origin_branches: query_params["branch"] = branch["name"] root_dir_branch_url = reverse( "browse-origin-directory", query_params=query_params ) assert_contains(resp, '<a href="%s">' % root_dir_branch_url) assert_contains(resp, '<li class="swh-release">', count=len(origin_releases)) query_params["branch"] = None for release in origin_releases: query_params["release"] = release["name"] root_dir_release_url = reverse( "browse-origin-directory", query_params=query_params ) assert_contains(resp, 'href="%s"' % root_dir_release_url) assert_contains(resp, "vault-cook-directory") assert_contains(resp, "vault-cook-revision") snapshot = archive_data.snapshot_get(origin_visit["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) swhid_context = { "origin": origin_info["url"], "visit": gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]), "anchor": gen_swhid(ObjectType.REVISION, head_rev_id), "path": f"/{path}" if path else None, } swh_dir_id = gen_swhid( ObjectType.DIRECTORY, directory_entries[0]["dir_id"], metadata=swhid_context ) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) assert_contains(resp, "swh-take-new-snapshot") _check_origin_link(resp, origin_info["url"]) assert_not_contains(resp, 
"swh-metadata-popover") def _check_origin_link(resp, origin_url): browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin_url} ) assert_contains(resp, f'href="{browse_origin_url}"') def test_browse_pull_request_branch( client, archive_data, origin_with_pull_request_branches ): origin_url = origin_with_pull_request_branches.url snapshot = archive_data.snapshot_get_latest(origin_url) pr_branch = random.choice( [ branch for branch in snapshot["branches"].keys() if branch.startswith("refs/pull/") ] ) url = reverse( "browse-origin-directory", query_params={"origin_url": origin_url, "branch": pr_branch}, ) check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) diff --git a/swh/web/tests/browse/views/test_release.py b/swh/web/tests/browse/views/test_release.py index 35fe6395..7924e333 100644 --- a/swh/web/tests/browse/views/test_release.py +++ b/swh/web/tests/browse/views/test_release.py @@ -1,162 +1,164 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random from django.utils.html import escape from swh.model.swhids import ObjectType from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import format_utc_iso_date, reverse from swh.web.tests.django_asserts import assert_contains from swh.web.tests.utils import check_html_get_response def test_release_browse(client, archive_data, release): _release_browse_checks(client, release, archive_data) def test_release_browse_with_origin_snapshot( client, archive_data, origin_with_releases ): origin_url = origin_with_releases["url"] snapshot = archive_data.snapshot_get_latest(origin_url) release = random.choice( [ b["target"] for b in snapshot["branches"].values() if b["target_type"] == "release" ] ) _release_browse_checks(client, release, archive_data, origin_url=origin_url) _release_browse_checks(client, release, archive_data, snapshot_id=snapshot["id"]) _release_browse_checks( client, release, archive_data, origin_url=origin_url, snapshot_id=snapshot["id"], ) def test_release_browse_not_found(client, archive_data, unknown_release): url = reverse("browse-release", url_args={"sha1_git": unknown_release}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) err_msg = "Release with sha1_git %s not found" % unknown_release assert_contains(resp, err_msg, status_code=404) def test_release_uppercase(client, release): url = reverse( "browse-release-uppercase-checksum", url_args={"sha1_git": release.upper()} ) resp = check_html_get_response(client, url, status_code=302) redirect_url = reverse("browse-release", url_args={"sha1_git": release}) assert resp["location"] == redirect_url def _release_browse_checks( client, release, archive_data, origin_url=None, snapshot_id=None ): query_params = {"origin_url": origin_url, "snapshot": snapshot_id} url = reverse( "browse-release", url_args={"sha1_git": release}, query_params=query_params ) release_data = archive_data.release_get(release) release_id = release_data["id"] release_name = release_data["name"] author_name = release_data["author"]["name"] release_date = release_data["date"] message = release_data["message"] target_type = release_data["target_type"] target = release_data["target"] target_url = reverse( "browse-revision", url_args={"sha1_git": target}, query_params=query_params 
) message_lines = message.split("\n") resp = check_html_get_response( client, url, status_code=200, template_used="browse/release.html" ) assert_contains(resp, author_name) assert_contains(resp, format_utc_iso_date(release_date)) assert_contains( resp, "<h6>%s</h6>%s" % (message_lines[0] or "None", "\n".join(message_lines[1:])), ) assert_contains(resp, release_id) assert_contains(resp, release_name) assert_contains(resp, target_type) assert_contains(resp, '<a href="%s">%s</a>' % (escape(target_url), target)) swh_rel_id = gen_swhid(ObjectType.RELEASE, release_id) swh_rel_id_url = reverse("browse-swhid", url_args={"swhid": swh_rel_id}) assert_contains(resp, swh_rel_id) assert_contains(resp, swh_rel_id_url) if origin_url: browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin_url} ) assert_contains(resp, f'href="{browse_origin_url}"') elif snapshot_id: swh_snp_id = gen_swhid(ObjectType.SNAPSHOT, snapshot_id) swh_snp_id_url = reverse("browse-swhid", url_args={"swhid": swh_snp_id}) assert_contains(resp, f'href="{swh_snp_id_url}"') if release_data["target_type"] == "revision": rev = archive_data.revision_get(release_data["target"]) rev_dir = rev["directory"] rev_metadata = {} dir_metadata = {} if origin_url: directory_url = reverse( "browse-origin-directory", query_params={ "origin_url": origin_url, "release": release_data["name"], "snapshot": snapshot_id, }, ) rev_metadata["origin"] = dir_metadata["origin"] = origin_url snapshot = archive_data.snapshot_get_latest(origin_url) rev_metadata["visit"] = dir_metadata["visit"] = gen_swhid( ObjectType.SNAPSHOT, snapshot["id"] ) dir_metadata["anchor"] = gen_swhid(ObjectType.RELEASE, release_id) elif snapshot_id: directory_url = reverse( "browse-snapshot-directory", url_args={"snapshot_id": snapshot_id}, - query_params={"release": release_data["name"],}, + query_params={ + "release": release_data["name"], + }, ) rev_metadata["visit"] = dir_metadata["visit"] = gen_swhid( ObjectType.SNAPSHOT, snapshot_id ) dir_metadata["anchor"] = gen_swhid(ObjectType.RELEASE, release_id) else: directory_url = reverse("browse-directory", url_args={"sha1_git": rev_dir}) assert_contains(resp, escape(directory_url)) swh_rev_id = gen_swhid(ObjectType.REVISION, rev["id"], metadata=rev_metadata) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) assert_contains(resp, swh_rev_id_url) swh_dir_id = gen_swhid(ObjectType.DIRECTORY, rev_dir, metadata=dir_metadata) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id_url) diff --git a/swh/web/tests/browse/views/test_revision.py b/swh/web/tests/browse/views/test_revision.py index 34d65384..311ef84d 100644 --- a/swh/web/tests/browse/views/test_revision.py +++ b/swh/web/tests/browse/views/test_revision.py @@ -1,335 +1,346 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json import random from hypothesis import given from django.utils.html import escape from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import Revision, RevisionType, TimestampWithTimezone from swh.model.swhids import ObjectType from swh.web.common.identifiers import gen_swhid from swh.web.common.utils import format_utc_iso_date, parse_iso8601_date_to_utc, reverse from swh.web.tests.django_asserts import assert_contains, 
assert_not_contains from swh.web.tests.strategies import new_origin, new_person, new_swh_date from swh.web.tests.utils import check_html_get_response def test_revision_browse(client, archive_data, revision): _revision_browse_checks(client, archive_data, revision) def test_revision_origin_snapshot_browse(client, archive_data, swh_scheduler, origin): snapshot = archive_data.snapshot_get_latest(origin["url"]) revision = archive_data.snapshot_get_head(snapshot) _revision_browse_checks(client, archive_data, revision, origin_url=origin["url"]) _revision_browse_checks(client, archive_data, revision, snapshot=snapshot) _revision_browse_checks( - client, archive_data, revision, origin_url=origin["url"], snapshot=snapshot, + client, + archive_data, + revision, + origin_url=origin["url"], + snapshot=snapshot, ) revision = random.choice(archive_data.revision_log(revision))["id"] _revision_browse_checks(client, archive_data, revision, origin_url=origin["url"]) def test_revision_log_browse(client, archive_data, revision): per_page = 10 revision_log = archive_data.revision_log(revision) revision_log_sorted = sorted( revision_log, key=lambda rev: -parse_iso8601_date_to_utc(rev["committer_date"]).timestamp(), ) url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"per_page": per_page}, ) next_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, - query_params={"offset": per_page, "per_page": per_page,}, + query_params={ + "offset": per_page, + "per_page": per_page, + }, ) nb_log_entries = per_page if len(revision_log_sorted) < per_page: nb_log_entries = len(revision_log_sorted) resp = check_html_get_response( client, url, status_code=200, template_used="browse/revision-log.html" ) assert_contains(resp, '<tr class="swh-revision-log-entry', count=nb_log_entries) assert_contains(resp, '<a class="page-link">Newer</a>') if len(revision_log_sorted) > per_page: assert_contains( - resp, '<a class="page-link" href="%s">Older</a>' % escape(next_page_url), + resp, + '<a class="page-link" href="%s">Older</a>' % escape(next_page_url), ) for log in revision_log_sorted[:per_page]: revision_url = reverse("browse-revision", url_args={"sha1_git": log["id"]}) assert_contains(resp, log["id"][:7]) assert_contains(resp, log["author"]["name"]) assert_contains(resp, format_utc_iso_date(log["date"])) assert_contains(resp, escape(log["message"])) assert_contains(resp, format_utc_iso_date(log["committer_date"])) assert_contains(resp, revision_url) if len(revision_log_sorted) <= per_page: return resp = check_html_get_response( client, next_page_url, status_code=200, template_used="browse/revision-log.html" ) prev_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": 0, "per_page": per_page}, ) next_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": 2 * per_page, "per_page": per_page}, ) nb_log_entries = len(revision_log_sorted) - per_page if nb_log_entries > per_page: nb_log_entries = per_page assert_contains(resp, '<tr class="swh-revision-log-entry', count=nb_log_entries) assert_contains( resp, '<a class="page-link" href="%s">Newer</a>' % escape(prev_page_url) ) if len(revision_log_sorted) > 2 * per_page: assert_contains( - resp, '<a class="page-link" href="%s">Older</a>' % escape(next_page_url), + resp, + '<a class="page-link" href="%s">Older</a>' % escape(next_page_url), ) if len(revision_log_sorted) <= 2 * per_page: return resp = check_html_get_response( client, next_page_url, 
status_code=200, template_used="browse/revision-log.html" ) prev_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": per_page, "per_page": per_page}, ) next_page_url = reverse( "browse-revision-log", url_args={"sha1_git": revision}, query_params={"offset": 3 * per_page, "per_page": per_page}, ) nb_log_entries = len(revision_log_sorted) - 2 * per_page if nb_log_entries > per_page: nb_log_entries = per_page assert_contains(resp, '<tr class="swh-revision-log-entry', count=nb_log_entries) assert_contains( resp, '<a class="page-link" href="%s">Newer</a>' % escape(prev_page_url) ) if len(revision_log_sorted) > 3 * per_page: assert_contains( - resp, '<a class="page-link" href="%s">Older</a>' % escape(next_page_url), + resp, + '<a class="page-link" href="%s">Older</a>' % escape(next_page_url), ) @given(new_origin()) def test_revision_request_errors(client, revision, unknown_revision, new_origin): url = reverse("browse-revision", url_args={"sha1_git": unknown_revision}) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains( resp, "Revision with sha1_git %s not found" % unknown_revision, status_code=404 ) url = reverse( "browse-revision", url_args={"sha1_git": revision}, query_params={"origin_url": new_origin.url}, ) resp = check_html_get_response( client, url, status_code=404, template_used="error.html" ) assert_contains( resp, "the origin mentioned in your request" " appears broken", status_code=404 ) def test_revision_uppercase(client, revision): url = reverse( "browse-revision-uppercase-checksum", url_args={"sha1_git": revision.upper()} ) resp = check_html_get_response(client, url, status_code=302) redirect_url = reverse("browse-revision", url_args={"sha1_git": revision}) assert resp["location"] == redirect_url def _revision_browse_checks( client, archive_data, revision, origin_url=None, snapshot=None ): query_params = {} if origin_url: query_params["origin_url"] = origin_url if snapshot: query_params["snapshot"] = snapshot["id"] url = reverse( "browse-revision", url_args={"sha1_git": revision}, query_params=query_params ) revision_data = archive_data.revision_get(revision) author_name = revision_data["author"]["name"] committer_name = revision_data["committer"]["name"] dir_id = revision_data["directory"] if origin_url: snapshot = archive_data.snapshot_get_latest(origin_url) history_url = reverse( - "browse-origin-log", query_params={"revision": revision, **query_params}, + "browse-origin-log", + query_params={"revision": revision, **query_params}, ) elif snapshot: history_url = reverse( "browse-snapshot-log", url_args={"snapshot_id": snapshot["id"]}, query_params={"revision": revision}, ) else: history_url = reverse("browse-revision-log", url_args={"sha1_git": revision}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/revision.html" ) assert_contains(resp, author_name) assert_contains(resp, committer_name) assert_contains(resp, history_url) for parent in revision_data["parents"]: parent_url = reverse( "browse-revision", url_args={"sha1_git": parent}, query_params=query_params ) assert_contains(resp, '<a href="%s">%s</a>' % (escape(parent_url), parent[:7])) author_date = revision_data["date"] committer_date = revision_data["committer_date"] message_lines = revision_data["message"].split("\n") assert_contains(resp, format_utc_iso_date(author_date)) assert_contains(resp, format_utc_iso_date(committer_date)) assert_contains(resp, escape(message_lines[0])) 
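# the remaining message lines form the commit body and must be escaped in the page as well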
assert_contains(resp, escape("\n".join(message_lines[1:]))) assert_contains(resp, "vault-cook-directory") assert_contains(resp, "vault-cook-revision") swh_rev_id = gen_swhid(ObjectType.REVISION, revision) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) assert_contains(resp, swh_rev_id) assert_contains(resp, swh_rev_id_url) swh_dir_id = gen_swhid(ObjectType.DIRECTORY, dir_id) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) if origin_url: assert_contains(resp, "swh-take-new-snapshot") swh_rev_id = gen_swhid(ObjectType.REVISION, revision) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) if origin_url: browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin_url} ) assert_contains(resp, f'href="{browse_origin_url}"') elif snapshot: swh_snp_id = gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]) swh_snp_id_url = reverse("browse-swhid", url_args={"swhid": swh_snp_id}) assert_contains(resp, f'href="{swh_snp_id_url}"') swhid_context = {} if origin_url: swhid_context["origin"] = origin_url if snapshot: swhid_context["visit"] = gen_swhid(ObjectType.SNAPSHOT, snapshot["id"]) swh_rev_id = gen_swhid(ObjectType.REVISION, revision, metadata=swhid_context) swh_rev_id_url = reverse("browse-swhid", url_args={"swhid": swh_rev_id}) assert_contains(resp, swh_rev_id) assert_contains(resp, swh_rev_id_url) swhid_context["anchor"] = gen_swhid(ObjectType.REVISION, revision) swh_dir_id = gen_swhid(ObjectType.DIRECTORY, dir_id, metadata=swhid_context) swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id}) assert_contains(resp, swh_dir_id) assert_contains(resp, swh_dir_id_url) def test_revision_invalid_path(client, archive_data, revision): path = "foo/bar" url = reverse( "browse-revision", url_args={"sha1_git": revision}, query_params={"path": path} ) resp = check_html_get_response( client, url, status_code=404, template_used="browse/revision.html" ) directory = archive_data.revision_get(revision)["directory"] error_message = ( f"Directory entry with path {path} from root directory {directory} not found" ) assert_contains(resp, error_message, status_code=404) assert_not_contains(resp, "swh-metadata-popover", status_code=404) @given(new_person(), new_swh_date()) def test_revision_metadata_display(archive_data, client, directory, person, date): metadata = {"foo": "bar"} revision = Revision( directory=hash_to_bytes(directory), author=person, committer=person, message=b"commit message", date=TimestampWithTimezone.from_datetime(date), committer_date=TimestampWithTimezone.from_datetime(date), synthetic=False, type=RevisionType.GIT, metadata=metadata, ) archive_data.revision_add([revision]) url = reverse("browse-revision", url_args={"sha1_git": hash_to_hex(revision.id)}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/revision.html" ) assert_contains(resp, "swh-metadata-popover") assert_contains(resp, escape(json.dumps(metadata, indent=4))) diff --git a/swh/web/tests/browse/views/test_snapshot.py b/swh/web/tests/browse/views/test_snapshot.py index e4e929c8..ad60ef03 100644 --- a/swh/web/tests/browse/views/test_snapshot.py +++ b/swh/web/tests/browse/views/test_snapshot.py @@ -1,427 +1,448 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level 
LICENSE file for more information import random import re import string from dateutil import parser from hypothesis import given import pytest from django.utils.html import escape from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, OriginVisit, OriginVisitStatus, Release, Revision, RevisionType, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.storage.utils import now from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.common.utils import reverse from swh.web.tests.data import random_sha1 from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.strategies import new_origin, new_person, new_swh_date, visit_dates from swh.web.tests.utils import check_html_get_response @pytest.mark.parametrize( "browse_context,template_used", [ ("log", "revision-log.html"), ("branches", "branches.html"), ("releases", "releases.html"), ], ) def test_snapshot_browse_with_id(client, browse_context, template_used, snapshot): url = reverse( f"browse-snapshot-{browse_context}", url_args={"snapshot_id": snapshot} ) resp = check_html_get_response( client, url, status_code=200, template_used=f"browse/{template_used}" ) assert_contains(resp, f"swh:1:snp:{snapshot}") @pytest.mark.parametrize("browse_context", ["log", "branches", "releases"]) def test_snapshot_browse_with_id_and_origin( client, browse_context, archive_data, origin ): snapshot = archive_data.snapshot_get_latest(origin["url"]) url = reverse( f"browse-snapshot-{browse_context}", url_args={"snapshot_id": snapshot["id"]}, query_params={"origin_url": origin["url"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="includes/snapshot-context.html" ) assert_contains(resp, origin["url"]) @pytest.mark.parametrize("browse_context", ["log", "branches", "releases"]) def test_snapshot_browse_with_id_origin_and_timestamp( client, browse_context, archive_data, origin_with_multiple_visits ): visit = archive_data.origin_visit_get(origin_with_multiple_visits["url"])[0] url = reverse( f"browse-snapshot-{browse_context}", url_args={"snapshot_id": visit["snapshot"]}, query_params={"origin_url": visit["origin"], "timestamp": visit["date"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="includes/snapshot-context.html" ) requested_time = parser.parse(visit["date"]).strftime("%d %B %Y, %H:%M") assert_contains(resp, requested_time) assert_contains(resp, visit["origin"]) @pytest.mark.parametrize("browse_context", ["log", "branches", "releases"]) def test_snapshot_browse_without_id(client, browse_context, archive_data, origin): url = reverse( f"browse-snapshot-{browse_context}", query_params={"origin_url": origin["url"]} ) # This will be redirected to /snapshot/<latest_snapshot_id>/log - resp = check_html_get_response(client, url, status_code=302,) + resp = check_html_get_response( + client, + url, + status_code=302, + ) snapshot = archive_data.snapshot_get_latest(origin["url"]) assert resp.url == reverse( f"browse-snapshot-{browse_context}", url_args={"snapshot_id": snapshot["id"]}, query_params={"origin_url": origin["url"]}, ) @pytest.mark.parametrize("browse_context", ["log", "branches", "releases"]) def test_snapshot_browse_without_id_and_origin(client, browse_context): url = reverse(f"browse-snapshot-{browse_context}") - resp = check_html_get_response(client, url, status_code=400,) + resp = check_html_get_response( + client, + url, + status_code=400, + ) # assert_contains works only with a 
success response, using re.search instead assert re.search( "An origin URL must be provided as a query parameter", resp.content.decode("utf-8"), ) def test_snapshot_browse_branches(client, archive_data, origin): snapshot = archive_data.snapshot_get_latest(origin["url"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) snapshot_content = process_snapshot_branches(snapshot) _origin_branches_test_helper( client, origin, snapshot_content, snapshot_sizes, snapshot_id=snapshot["id"] ) def _origin_branches_test_helper( client, origin_info, origin_snapshot, snapshot_sizes, snapshot_id ): query_params = {"origin_url": origin_info["url"], "snapshot": snapshot_id} url = reverse( "browse-snapshot-branches", url_args={"snapshot_id": snapshot_id}, query_params=query_params, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/branches.html" ) origin_branches = origin_snapshot[0] origin_releases = origin_snapshot[1] origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) nb_releases = len(origin_releases) if nb_releases > 0: assert_contains(resp, f'href="{escape(origin_releases_url)}">') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<tr class="swh-branch-entry', count=len(origin_branches)) for branch in origin_branches: browse_branch_url = reverse( "browse-origin-directory", query_params={"branch": branch["name"], **query_params}, ) assert_contains(resp, '<a href="%s">' % escape(browse_branch_url)) browse_revision_url = reverse( "browse-revision", url_args={"sha1_git": branch["revision"]}, query_params=query_params, ) assert_contains(resp, '<a href="%s">' % escape(browse_revision_url)) _check_origin_link(resp, origin_info["url"]) def _check_origin_link(resp, origin_url): browse_origin_url = reverse( "browse-origin", query_params={"origin_url": origin_url} ) assert_contains(resp, f'href="{browse_origin_url}"') @given( - new_origin(), visit_dates(), + new_origin(), + visit_dates(), ) def test_snapshot_branches_pagination_with_alias( - client, archive_data, mocker, release, revisions_list, new_origin, visit_dates, + client, + archive_data, + mocker, + release, + revisions_list, + new_origin, + visit_dates, ): """ When a snapshot contains a branch or a release alias, pagination links in the branches / releases view should be displayed. 
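The snapshot built below holds twice as many revision branches as the mocked PER_PAGE value, plus a release alias, so the rendered view has more than one page to link to.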
""" revisions = revisions_list(size=10) mocker.patch("swh.web.browse.snapshot_context.PER_PAGE", len(revisions) / 2) snp_dict = {"branches": {}, "id": hash_to_bytes(random_sha1())} for i in range(len(revisions)): branch = "".join(random.choices(string.ascii_lowercase, k=8)) snp_dict["branches"][branch.encode()] = { "target_type": "revision", "target": hash_to_bytes(revisions[i]), } release_name = "".join(random.choices(string.ascii_lowercase, k=8)) snp_dict["branches"][b"RELEASE_ALIAS"] = { "target_type": "alias", "target": release_name.encode(), } snp_dict["branches"][release_name.encode()] = { "target_type": "release", "target": hash_to_bytes(release), } archive_data.origin_add([new_origin]) archive_data.snapshot_add([Snapshot.from_dict(snp_dict)]) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_dates[0], + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=now(), status="full", snapshot=snp_dict["id"], ) archive_data.origin_visit_status_add([visit_status]) snapshot = archive_data.snapshot_get_latest(new_origin.url) url = reverse( "browse-snapshot-branches", url_args={"snapshot_id": snapshot["id"]}, query_params={"origin_url": new_origin.url}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/branches.html" ) assert_contains(resp, '<ul class="pagination') def test_pull_request_branches_filtering( client, origin_with_pull_request_branches, archive_data ): origin_url = origin_with_pull_request_branches.url # check no pull request branches are displayed in the Branches / Releases dropdown url = reverse("browse-origin-directory", query_params={"origin_url": origin_url}) resp = check_html_get_response( client, url, status_code=200, template_used="browse/directory.html" ) assert_not_contains(resp, "refs/pull/") snapshot = archive_data.snapshot_get_latest(origin_url) # check no pull request branches are displayed in the branches view url = reverse( "browse-snapshot-branches", url_args={"snapshot_id": snapshot["id"]}, query_params={"origin_url": origin_url}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/branches.html" ) assert_not_contains(resp, "refs/pull/") def test_snapshot_browse_releases(client, archive_data, origin): origin_visits = archive_data.origin_visit_get(origin["url"]) visit = origin_visits[-1] snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_sizes = archive_data.snapshot_count_branches(snapshot["id"]) snapshot_content = process_snapshot_branches(snapshot) _origin_releases_test_helper( client, origin, snapshot_content, snapshot_sizes, snapshot_id=visit["snapshot"] ) def _origin_releases_test_helper( client, origin_info, origin_snapshot, snapshot_sizes, snapshot_id=None ): query_params = {"origin_url": origin_info["url"], "snapshot": snapshot_id} url = reverse( "browse-snapshot-releases", url_args={"snapshot_id": snapshot_id}, query_params=query_params, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse/releases.html" ) origin_releases = origin_snapshot[1] origin_branches_url = reverse("browse-origin-branches", query_params=query_params) assert_contains(resp, f'href="{escape(origin_branches_url)}"') assert_contains(resp, f"Branches ({snapshot_sizes['revision']})") origin_releases_url = reverse("browse-origin-releases", query_params=query_params) nb_releases = len(origin_releases) if 
nb_releases > 0: assert_contains(resp, f'href="{escape(origin_releases_url)}"') assert_contains(resp, f"Releases ({snapshot_sizes['release']})") assert_contains(resp, '<tr class="swh-release-entry', count=nb_releases) assert_contains(resp, 'title="The release', count=nb_releases) for release in origin_releases: query_params["release"] = release["name"] browse_release_url = reverse( "browse-release", url_args={"sha1_git": release["id"]}, query_params=query_params, ) browse_revision_url = reverse( "browse-revision", url_args={"sha1_git": release["target"]}, query_params=query_params, ) assert_contains(resp, '<a href="%s">' % escape(browse_release_url)) assert_contains(resp, '<a href="%s">' % escape(browse_revision_url)) _check_origin_link(resp, origin_info["url"]) def test_snapshot_content_redirect(client, snapshot): qry = {"extra-arg": "extra"} url = reverse( "browse-snapshot-content", url_args={"snapshot_id": snapshot}, query_params=qry ) resp = check_html_get_response(client, url, status_code=301) assert resp.url == reverse( "browse-content", query_params={**{"snapshot_id": snapshot}, **qry} ) def test_snapshot_content_legacy_redirect(client, snapshot): qry = {"extra-arg": "extra"} url_args = {"snapshot_id": snapshot, "path": "test.txt"} url = reverse("browse-snapshot-content-legacy", url_args=url_args, query_params=qry) resp = check_html_get_response(client, url, status_code=301) assert resp.url == reverse("browse-content", query_params={**url_args, **qry}) def test_browse_snapshot_log_no_revisions(client, archive_data, directory): release_name = "v1.0.0" release = Release( name=release_name.encode(), message=f"release {release_name}".encode(), target=hash_to_bytes(directory), target_type=ObjectType.DIRECTORY, synthetic=True, ) archive_data.release_add([release]) snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( target=release_name.encode(), target_type=TargetType.ALIAS ), release_name.encode(): SnapshotBranch( target=release.id, target_type=TargetType.RELEASE ), }, ) archive_data.snapshot_add([snapshot]) snp_url = reverse( "browse-snapshot-directory", url_args={"snapshot_id": snapshot.id.hex()} ) log_url = reverse( "browse-snapshot-log", url_args={"snapshot_id": snapshot.id.hex()} ) resp = check_html_get_response( client, snp_url, status_code=200, template_used="browse/directory.html" ) assert_not_contains(resp, log_url) resp = check_html_get_response( client, log_url, status_code=404, template_used="error.html" ) assert_contains( resp, "No revisions history found in the current snapshot context.", status_code=404, ) @given(new_person(), new_swh_date()) def test_browse_snapshot_log_when_revisions( client, archive_data, directory, person, date ): revision = Revision( directory=hash_to_bytes(directory), author=person, committer=person, message=b"commit message", date=TimestampWithTimezone.from_datetime(date), committer_date=TimestampWithTimezone.from_datetime(date), synthetic=False, type=RevisionType.GIT, ) archive_data.revision_add([revision]) snapshot = Snapshot( branches={ b"HEAD": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION ), }, ) archive_data.snapshot_add([snapshot]) snp_url = reverse( "browse-snapshot-directory", url_args={"snapshot_id": snapshot.id.hex()} ) log_url = reverse( "browse-snapshot-log", url_args={"snapshot_id": snapshot.id.hex()} ) resp = check_html_get_response( client, snp_url, status_code=200, template_used="browse/directory.html" ) assert_contains(resp, log_url) diff --git a/swh/web/tests/common/test_archive.py
b/swh/web/tests/common/test_archive.py index 80b76396..2812176c 100644 --- a/swh/web/tests/common/test_archive.py +++ b/swh/web/tests/common/test_archive.py @@ -1,1215 +1,1253 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import datetime import hashlib import itertools import random from hypothesis import given, settings import pytest from swh.model.from_disk import DentryPerms from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( Directory, DirectoryEntry, Origin, OriginVisit, OriginVisitStatus, Revision, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import ObjectType from swh.storage.utils import now from swh.web.common import archive from swh.web.common.exc import BadInputExc, NotFoundExc from swh.web.common.typing import OriginInfo, PagedResult from swh.web.tests.conftest import ctags_json_missing, fossology_missing from swh.web.tests.data import random_content, random_sha1 from swh.web.tests.strategies import new_origin, new_revision, visit_dates def test_lookup_multiple_hashes_all_present(contents): input_data = [] expected_output = [] for cnt in contents: input_data.append({"sha1": cnt["sha1"]}) expected_output.append({"sha1": cnt["sha1"], "found": True}) assert archive.lookup_multiple_hashes(input_data) == expected_output def test_lookup_multiple_hashes_some_missing(contents, unknown_contents): input_contents = list(itertools.chain(contents, unknown_contents)) random.shuffle(input_contents) input_data = [] expected_output = [] for cnt in input_contents: input_data.append({"sha1": cnt["sha1"]}) expected_output.append({"sha1": cnt["sha1"], "found": cnt in contents}) assert archive.lookup_multiple_hashes(input_data) == expected_output def test_lookup_hash_does_not_exist(): unknown_content_ = random_content() actual_lookup = archive.lookup_hash("sha1_git:%s" % unknown_content_["sha1_git"]) assert actual_lookup == {"found": None, "algo": "sha1_git"} def test_lookup_hash_exist(archive_data, content): actual_lookup = archive.lookup_hash("sha1:%s" % content["sha1"]) content_metadata = archive_data.content_get(content["sha1"]) assert {"found": content_metadata, "algo": "sha1"} == actual_lookup def test_search_hash_does_not_exist(): unknown_content_ = random_content() actual_lookup = archive.search_hash("sha1_git:%s" % unknown_content_["sha1_git"]) assert {"found": False} == actual_lookup def test_search_hash_exist(content): actual_lookup = archive.search_hash("sha1:%s" % content["sha1"]) assert {"found": True} == actual_lookup @pytest.mark.skipif( ctags_json_missing, reason="requires ctags with json output support" ) def test_lookup_content_ctags(indexer_data, contents_with_ctags): content_sha1 = random.choice(contents_with_ctags["sha1s"]) indexer_data.content_add_ctags(content_sha1) actual_ctags = list(archive.lookup_content_ctags("sha1:%s" % content_sha1)) expected_data = list(indexer_data.content_get_ctags(content_sha1)) for ctag in expected_data: ctag["id"] = content_sha1 assert actual_ctags == expected_data def test_lookup_content_ctags_no_hash(): unknown_content_ = random_content() actual_ctags = list( archive.lookup_content_ctags("sha1:%s" % unknown_content_["sha1"]) ) assert actual_ctags == [] def test_lookup_content_filetype(indexer_data, content): 
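"""A mimetype registered through the indexer should be returned unchanged by archive.lookup_content_filetype."""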
indexer_data.content_add_mimetype(content["sha1"]) actual_filetype = archive.lookup_content_filetype(content["sha1"]) expected_filetype = indexer_data.content_get_mimetype(content["sha1"]) assert actual_filetype == expected_filetype def test_lookup_expression(indexer_data, contents_with_ctags): per_page = 10 expected_ctags = [] for content_sha1 in contents_with_ctags["sha1s"]: if len(expected_ctags) == per_page: break indexer_data.content_add_ctags(content_sha1) for ctag in indexer_data.content_get_ctags(content_sha1): if len(expected_ctags) == per_page: break if ctag["name"] == contents_with_ctags["symbol_name"]: del ctag["id"] ctag["sha1"] = content_sha1 expected_ctags.append(ctag) actual_ctags = list( archive.lookup_expression( contents_with_ctags["symbol_name"], last_sha1=None, per_page=10 ) ) assert actual_ctags == expected_ctags def test_lookup_expression_no_result(): expected_ctags = [] actual_ctags = list( archive.lookup_expression("barfoo", last_sha1=None, per_page=10) ) assert actual_ctags == expected_ctags @pytest.mark.skipif(fossology_missing, reason="requires fossology-nomossa installed") def test_lookup_content_license(indexer_data, content): indexer_data.content_add_license(content["sha1"]) actual_license = archive.lookup_content_license(content["sha1"]) expected_license = indexer_data.content_get_license(content["sha1"]) assert actual_license == expected_license def test_stat_counters(archive_data): actual_stats = archive.stat_counters() assert actual_stats == archive_data.stat_counters() @given(new_origin(), visit_dates()) def test_lookup_origin_visits(subtest, new_origin, visit_dates): # ensure archive_data fixture will be reset between each hypothesis # example test run @subtest def test_inner(archive_data): archive_data.origin_add([new_origin]) archive_data.origin_visit_add( [ - OriginVisit(origin=new_origin.url, date=ts, type="git",) + OriginVisit( + origin=new_origin.url, + date=ts, + type="git", + ) for ts in visit_dates ] ) actual_origin_visits = list( archive.lookup_origin_visits(new_origin.url, per_page=100) ) expected_visits = archive_data.origin_visit_get(new_origin.url) for expected_visit in expected_visits: expected_visit["origin"] = new_origin.url assert actual_origin_visits == expected_visits @given(new_origin(), visit_dates()) def test_lookup_origin_visit(archive_data, new_origin, visit_dates): archive_data.origin_add([new_origin]) visits = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=ts, type="git",) for ts in visit_dates] + [ + OriginVisit( + origin=new_origin.url, + date=ts, + type="git", + ) + for ts in visit_dates + ] ) visit = random.choice(visits).visit actual_origin_visit = archive.lookup_origin_visit(new_origin.url, visit) expected_visit = dict(archive_data.origin_visit_get_by(new_origin.url, visit)) assert actual_origin_visit == expected_visit @given(new_origin(), visit_dates()) @settings(max_examples=1) def test_origin_visit_find_by_date_no_result(archive_data, new_origin, visit_dates): """No visit registered in storage for an origin should return no visit""" archive_data.origin_add([new_origin]) for visit_date in visit_dates: # No visit yet, so nothing will get returned actual_origin_visit_status = archive.origin_visit_find_by_date( new_origin.url, visit_date ) assert actual_origin_visit_status is None @settings(max_examples=1) @given(new_origin()) def test_origin_visit_find_by_date(archive_data, new_origin): # Add origin and two visits archive_data.origin_add([new_origin]) pivot_date = now() # First visit one hour 
before pivot date first_visit_date = pivot_date - datetime.timedelta(hours=1) # Second visit two hours after pivot date second_visit_date = pivot_date + datetime.timedelta(hours=2) visits = archive_data.origin_visit_add( [ - OriginVisit(origin=new_origin.url, date=visit_date, type="git",) + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) for visit_date in [first_visit_date, second_visit_date] ] ) # Finalize visits visit_statuses = [] for visit in visits: visit_statuses.append( OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=visit.date + datetime.timedelta(hours=1), type=visit.type, status="full", snapshot=None, ) ) archive_data.origin_visit_status_add(visit_statuses) # Check correct visit is returned when searching by date for search_date, greater_or_equal, expected_visit in [ (first_visit_date, True, 1), (pivot_date, True, 2), (pivot_date, False, 1), (second_visit_date, True, 2), ]: origin_visit = archive.origin_visit_find_by_date( new_origin.url, search_date, greater_or_equal ) assert origin_visit["visit"] == expected_visit @given(new_origin()) def test_lookup_origin(archive_data, new_origin): archive_data.origin_add([new_origin]) actual_origin = archive.lookup_origin({"url": new_origin.url}) expected_origin = archive_data.origin_get([new_origin.url])[0] assert actual_origin == expected_origin def test_lookup_origin_snapshots(archive_data, origin_with_multiple_visits): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) origin_snapshots = archive.lookup_origin_snapshots(origin_with_multiple_visits) assert set(origin_snapshots) == {v["snapshot"] for v in visits} def test_lookup_release_ko_id_checksum_not_a_sha1(invalid_sha1): with pytest.raises(BadInputExc) as e: archive.lookup_release(invalid_sha1) assert e.match("Invalid checksum") def test_lookup_release_ko_id_checksum_too_long(sha256): with pytest.raises(BadInputExc) as e: archive.lookup_release(sha256) assert e.match("Only sha1_git is supported.") def test_lookup_release_multiple(archive_data, releases): actual_releases = list(archive.lookup_release_multiple(releases)) expected_releases = [] for release_id in releases: release_info = archive_data.release_get(release_id) expected_releases.append(release_info) assert actual_releases == expected_releases def test_lookup_release_multiple_none_found(): unknown_releases_ = [random_sha1(), random_sha1(), random_sha1()] actual_releases = list(archive.lookup_release_multiple(unknown_releases_)) assert actual_releases == [None] * len(unknown_releases_) def test_lookup_directory_with_path_not_found(directory): path = "some/invalid/path/here" with pytest.raises(NotFoundExc) as e: archive.lookup_directory_with_path(directory, path) assert e.match( f"Directory entry with path {path} from root directory {directory} not found" ) def test_lookup_directory_with_path_found(archive_data, directory): directory_content = archive_data.directory_ls(directory) directory_entry = random.choice(directory_content) path = directory_entry["name"] actual_result = archive.lookup_directory_with_path(directory, path) assert actual_result == directory_entry def test_lookup_release(archive_data, release): actual_release = archive.lookup_release(release) assert actual_release == archive_data.release_get(release) def test_lookup_revision_with_context_ko_not_a_sha1(revision, invalid_sha1, sha256): sha1_git_root = revision sha1_git = invalid_sha1 with pytest.raises(BadInputExc) as e: 
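# an invalid checksum string is rejected with BadInputExc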
archive.lookup_revision_with_context(sha1_git_root, sha1_git) assert e.match("Invalid checksum query string") sha1_git = sha256 with pytest.raises(BadInputExc) as e: archive.lookup_revision_with_context(sha1_git_root, sha1_git) assert e.match("Only sha1_git is supported") def test_lookup_revision_with_context_ko_sha1_git_does_not_exist( revision, unknown_revision ): sha1_git_root = revision sha1_git = unknown_revision with pytest.raises(NotFoundExc) as e: archive.lookup_revision_with_context(sha1_git_root, sha1_git) assert e.match("Revision %s not found" % sha1_git) def test_lookup_revision_with_context_ko_root_sha1_git_does_not_exist( revision, unknown_revision ): sha1_git_root = unknown_revision sha1_git = revision with pytest.raises(NotFoundExc) as e: archive.lookup_revision_with_context(sha1_git_root, sha1_git) assert e.match("Revision root %s not found" % sha1_git_root) def test_lookup_revision_with_context(archive_data, ancestor_revisions): sha1_git = ancestor_revisions["sha1_git"] root_sha1_git = ancestor_revisions["sha1_git_root"] for sha1_git_root in (root_sha1_git, {"id": hash_to_bytes(root_sha1_git)}): actual_revision = archive.lookup_revision_with_context(sha1_git_root, sha1_git) children = [] for rev in archive_data.revision_log(root_sha1_git): for p_rev in rev["parents"]: p_rev_hex = hash_to_hex(p_rev) if p_rev_hex == sha1_git: children.append(rev["id"]) expected_revision = archive_data.revision_get(sha1_git) expected_revision["children"] = children assert actual_revision == expected_revision def test_lookup_revision_with_context_ko(non_ancestor_revisions): sha1_git = non_ancestor_revisions["sha1_git"] root_sha1_git = non_ancestor_revisions["sha1_git_root"] with pytest.raises(NotFoundExc) as e: archive.lookup_revision_with_context(root_sha1_git, sha1_git) assert e.match("Revision %s is not an ancestor of %s" % (sha1_git, root_sha1_git)) def test_lookup_directory_with_revision_not_found(): unknown_revision_ = random_sha1() with pytest.raises(NotFoundExc) as e: archive.lookup_directory_with_revision(unknown_revision_) assert e.match("Revision %s not found" % unknown_revision_) @given(new_revision()) def test_lookup_directory_with_revision_unknown_content(archive_data, new_revision): unknown_content_ = random_content() dir_path = "README.md" # A directory that points to unknown content dir = Directory( entries=( DirectoryEntry( name=bytes(dir_path.encode("utf-8")), type="file", target=hash_to_bytes(unknown_content_["sha1_git"]), perms=DentryPerms.content, ), ) ) # Create a revision that points to a directory # Which points to unknown content new_revision = new_revision.to_dict() new_revision["directory"] = dir.id del new_revision["id"] new_revision = Revision.from_dict(new_revision) # Add the directory and revision in mem archive_data.directory_add([dir]) archive_data.revision_add([new_revision]) new_revision_id = hash_to_hex(new_revision.id) with pytest.raises(NotFoundExc) as e: archive.lookup_directory_with_revision(new_revision_id, dir_path) assert e.match("Content not found for revision %s" % new_revision_id) def test_lookup_directory_with_revision_ko_path_to_nowhere(revision): invalid_path = "path/to/something/unknown" with pytest.raises(NotFoundExc) as e: archive.lookup_directory_with_revision(revision, invalid_path) assert e.match("Directory or File") assert e.match(invalid_path) assert e.match("revision %s" % revision) assert e.match("not found") def test_lookup_directory_with_revision_submodules( archive_data, revision_with_submodules ): rev_sha1_git = 
revision_with_submodules["rev_sha1_git"] rev_dir_path = revision_with_submodules["rev_dir_rev_path"] actual_data = archive.lookup_directory_with_revision(rev_sha1_git, rev_dir_path) revision = archive_data.revision_get(revision_with_submodules["rev_sha1_git"]) directory = archive_data.directory_ls(revision["directory"]) rev_entry = next(e for e in directory if e["name"] == rev_dir_path) expected_data = { "content": archive_data.revision_get(rev_entry["target"]), "path": rev_dir_path, "revision": rev_sha1_git, "type": "rev", } assert actual_data == expected_data def test_lookup_directory_with_revision_without_path(archive_data, revision): actual_directory_entries = archive.lookup_directory_with_revision(revision) revision_data = archive_data.revision_get(revision) expected_directory_entries = archive_data.directory_ls(revision_data["directory"]) assert actual_directory_entries["type"] == "dir" assert actual_directory_entries["content"] == expected_directory_entries def test_lookup_directory_with_revision_with_path(archive_data, revision): rev_data = archive_data.revision_get(revision) dir_entries = [ e for e in archive_data.directory_ls(rev_data["directory"]) if e["type"] in ("file", "dir") ] expected_dir_entry = random.choice(dir_entries) actual_dir_entry = archive.lookup_directory_with_revision( revision, expected_dir_entry["name"] ) assert actual_dir_entry["type"] == expected_dir_entry["type"] assert actual_dir_entry["revision"] == revision assert actual_dir_entry["path"] == expected_dir_entry["name"] if actual_dir_entry["type"] == "file": del actual_dir_entry["content"]["checksums"]["blake2s256"] for key in ("checksums", "status", "length"): assert actual_dir_entry["content"][key] == expected_dir_entry[key] else: sub_dir_entries = archive_data.directory_ls(expected_dir_entry["target"]) assert actual_dir_entry["content"] == sub_dir_entries def test_lookup_directory_with_revision_with_path_to_file_and_data( archive_data, revision ): rev_data = archive_data.revision_get(revision) dir_entries = [ e for e in archive_data.directory_ls(rev_data["directory"]) if e["type"] == "file" ] expected_dir_entry = random.choice(dir_entries) expected_data = archive_data.content_get_data( expected_dir_entry["checksums"]["sha1"] ) actual_dir_entry = archive.lookup_directory_with_revision( revision, expected_dir_entry["name"], with_data=True ) assert actual_dir_entry["type"] == expected_dir_entry["type"] assert actual_dir_entry["revision"] == revision assert actual_dir_entry["path"] == expected_dir_entry["name"] del actual_dir_entry["content"]["checksums"]["blake2s256"] for key in ("checksums", "status", "length"): assert actual_dir_entry["content"][key] == expected_dir_entry[key] assert actual_dir_entry["content"]["data"] == expected_data["data"] def test_lookup_revision(archive_data, revision): actual_revision = archive.lookup_revision(revision) assert actual_revision == archive_data.revision_get(revision) @given(new_revision()) def test_lookup_revision_invalid_msg(archive_data, new_revision): new_revision = new_revision.to_dict() new_revision["message"] = b"elegant fix for bug \xff" archive_data.revision_add([Revision.from_dict(new_revision)]) revision = archive.lookup_revision(hash_to_hex(new_revision["id"])) assert revision["message"] == "elegant fix for bug \\xff" assert "message" in revision["decoding_failures"] @given(new_revision()) def test_lookup_revision_msg_ok(archive_data, new_revision): archive_data.revision_add([new_revision]) revision_message = 
archive.lookup_revision_message(hash_to_hex(new_revision.id)) assert revision_message == {"message": new_revision.message} def test_lookup_revision_msg_no_rev(): unknown_revision_ = random_sha1() with pytest.raises(NotFoundExc) as e: archive.lookup_revision_message(unknown_revision_) assert e.match("Revision with sha1_git %s not found." % unknown_revision_) def test_lookup_revision_multiple(archive_data, revisions): actual_revisions = list(archive.lookup_revision_multiple(revisions)) expected_revisions = [] for rev in revisions: expected_revisions.append(archive_data.revision_get(rev)) assert actual_revisions == expected_revisions def test_lookup_revision_multiple_none_found(): unknown_revisions_ = [random_sha1(), random_sha1(), random_sha1()] actual_revisions = list(archive.lookup_revision_multiple(unknown_revisions_)) assert actual_revisions == [None] * len(unknown_revisions_) def test_lookup_revision_log(archive_data, revision): actual_revision_log = list(archive.lookup_revision_log(revision, limit=25)) expected_revision_log = archive_data.revision_log(revision, limit=25) assert actual_revision_log == expected_revision_log def _get_origin_branches(archive_data, origin): origin_visit = archive_data.origin_visit_get(origin["url"])[-1] snapshot = archive_data.snapshot_get(origin_visit["snapshot"]) branches = { k: v for (k, v) in snapshot["branches"].items() if v["target_type"] == "revision" } return branches def test_lookup_revision_log_by(archive_data, origin): branches = _get_origin_branches(archive_data, origin) branch_name = random.choice(list(branches.keys())) actual_log = list( archive.lookup_revision_log_by(origin["url"], branch_name, None, limit=25) ) expected_log = archive_data.revision_log(branches[branch_name]["target"], limit=25) assert actual_log == expected_log def test_lookup_revision_log_by_notfound(origin): with pytest.raises(NotFoundExc): archive.lookup_revision_log_by( origin["url"], "unknown_branch_name", None, limit=100 ) def test_lookup_content_raw_not_found(): unknown_content_ = random_content() with pytest.raises(NotFoundExc) as e: archive.lookup_content_raw("sha1:" + unknown_content_["sha1"]) assert e.match( "Content with %s checksum equals to %s not found!" % ("sha1", unknown_content_["sha1"]) ) def test_lookup_content_raw(archive_data, content): actual_content = archive.lookup_content_raw("sha256:%s" % content["sha256"]) expected_content = archive_data.content_get_data(content["sha1"]) assert actual_content == expected_content def test_lookup_empty_content_raw(empty_content): content_raw = archive.lookup_content_raw(f"sha1_git:{empty_content['sha1_git']}") assert content_raw["data"] == b"" def test_lookup_content_not_found(): unknown_content_ = random_content() with pytest.raises(NotFoundExc) as e: archive.lookup_content("sha1:%s" % unknown_content_["sha1"]) assert e.match( "Content with %s checksum equals to %s not found!" 
% ("sha1", unknown_content_["sha1"]) ) def test_lookup_content_with_sha1(archive_data, content): actual_content = archive.lookup_content(f"sha1:{content['sha1']}") expected_content = archive_data.content_get(content["sha1"]) assert actual_content == expected_content def test_lookup_content_with_sha256(archive_data, content): actual_content = archive.lookup_content(f"sha256:{content['sha256']}") expected_content = archive_data.content_get(content["sha1"]) assert actual_content == expected_content def test_lookup_directory_bad_checksum(): with pytest.raises(BadInputExc): archive.lookup_directory("directory_id") def test_lookup_directory_not_found(): unknown_directory_ = random_sha1() with pytest.raises(NotFoundExc) as e: archive.lookup_directory(unknown_directory_) assert e.match("Directory with sha1_git %s not found" % unknown_directory_) def test_lookup_directory(archive_data, directory): actual_directory_ls = list(archive.lookup_directory(directory)) expected_directory_ls = archive_data.directory_ls(directory) assert actual_directory_ls == expected_directory_ls def test_lookup_directory_empty(empty_directory): actual_directory_ls = list(archive.lookup_directory(empty_directory)) assert actual_directory_ls == [] def test_lookup_revision_by_nothing_found(origin): with pytest.raises(NotFoundExc): archive.lookup_revision_by(origin["url"], "invalid-branch-name") def test_lookup_revision_by(archive_data, origin): branches = _get_origin_branches(archive_data, origin) branch_name = random.choice(list(branches.keys())) actual_revision = archive.lookup_revision_by(origin["url"], branch_name) expected_revision = archive_data.revision_get(branches[branch_name]["target"]) assert actual_revision == expected_revision def test_lookup_revision_with_context_by_ko(origin, revision): with pytest.raises(NotFoundExc): archive.lookup_revision_with_context_by( origin["url"], "invalid-branch-name", None, revision ) def test_lookup_revision_with_context_by(archive_data, origin): branches = _get_origin_branches(archive_data, origin) branch_name = random.choice(list(branches.keys())) root_rev = branches[branch_name]["target"] root_rev_log = archive_data.revision_log(root_rev) children = defaultdict(list) for rev in root_rev_log: for rev_p in rev["parents"]: children[rev_p].append(rev["id"]) rev = root_rev_log[-1]["id"] actual_root_rev, actual_rev = archive.lookup_revision_with_context_by( origin["url"], branch_name, None, rev ) expected_root_rev = archive_data.revision_get(root_rev) expected_rev = archive_data.revision_get(rev) expected_rev["children"] = children[rev] assert actual_root_rev == expected_root_rev assert actual_rev == expected_rev def test_lookup_revision_through_ko_not_implemented(): with pytest.raises(NotImplementedError): archive.lookup_revision_through({"something-unknown": 10}) def test_lookup_revision_through_with_context_by(archive_data, origin): branches = _get_origin_branches(archive_data, origin) branch_name = random.choice(list(branches.keys())) root_rev = branches[branch_name]["target"] root_rev_log = archive_data.revision_log(root_rev) rev = root_rev_log[-1]["id"] assert archive.lookup_revision_through( { "origin_url": origin["url"], "branch_name": branch_name, "ts": None, "sha1_git": rev, } ) == archive.lookup_revision_with_context_by(origin["url"], branch_name, None, rev) def test_lookup_revision_through_with_revision_by(archive_data, origin): branches = _get_origin_branches(archive_data, origin) branch_name = random.choice(list(branches.keys())) assert archive.lookup_revision_through( - 
{"origin_url": origin["url"], "branch_name": branch_name, "ts": None,} + { + "origin_url": origin["url"], + "branch_name": branch_name, + "ts": None, + } ) == archive.lookup_revision_by(origin["url"], branch_name, None) def test_lookup_revision_through_with_context(ancestor_revisions): sha1_git = ancestor_revisions["sha1_git"] sha1_git_root = ancestor_revisions["sha1_git_root"] assert archive.lookup_revision_through( - {"sha1_git_root": sha1_git_root, "sha1_git": sha1_git,} + { + "sha1_git_root": sha1_git_root, + "sha1_git": sha1_git, + } ) == archive.lookup_revision_with_context(sha1_git_root, sha1_git) def test_lookup_revision_through_with_revision(revision): assert archive.lookup_revision_through( {"sha1_git": revision} ) == archive.lookup_revision(revision) def test_lookup_directory_through_revision_ko_not_found(revision): with pytest.raises(NotFoundExc): archive.lookup_directory_through_revision( {"sha1_git": revision}, "some/invalid/path" ) def test_lookup_directory_through_revision_ok(archive_data, revision): rev_data = archive_data.revision_get(revision) dir_entries = [ e for e in archive_data.directory_ls(rev_data["directory"]) if e["type"] == "file" ] dir_entry = random.choice(dir_entries) assert archive.lookup_directory_through_revision( {"sha1_git": revision}, dir_entry["name"] ) == (revision, archive.lookup_directory_with_revision(revision, dir_entry["name"])) def test_lookup_directory_through_revision_ok_with_data(archive_data, revision): rev_data = archive_data.revision_get(revision) dir_entries = [ e for e in archive_data.directory_ls(rev_data["directory"]) if e["type"] == "file" ] dir_entry = random.choice(dir_entries) assert archive.lookup_directory_through_revision( {"sha1_git": revision}, dir_entry["name"], with_data=True ) == ( revision, archive.lookup_directory_with_revision( revision, dir_entry["name"], with_data=True ), ) def test_lookup_known_objects( archive_data, content, directory, release, revision, snapshot ): expected = archive_data.content_find(content) assert archive.lookup_object(ObjectType.CONTENT, content["sha1_git"]) == expected expected = archive_data.directory_get(directory) assert archive.lookup_object(ObjectType.DIRECTORY, directory) == expected expected = archive_data.release_get(release) assert archive.lookup_object(ObjectType.RELEASE, release) == expected expected = archive_data.revision_get(revision) assert archive.lookup_object(ObjectType.REVISION, revision) == expected expected = {**archive_data.snapshot_get(snapshot), "next_branch": None} assert archive.lookup_object(ObjectType.SNAPSHOT, snapshot) == expected def test_lookup_unknown_objects( unknown_content, unknown_directory, unknown_release, unknown_revision, unknown_snapshot, ): with pytest.raises(NotFoundExc) as e: archive.lookup_object(ObjectType.CONTENT, unknown_content["sha1_git"]) assert e.match(r"Content.*not found") with pytest.raises(NotFoundExc) as e: archive.lookup_object(ObjectType.DIRECTORY, unknown_directory) assert e.match(r"Directory.*not found") with pytest.raises(NotFoundExc) as e: archive.lookup_object(ObjectType.RELEASE, unknown_release) assert e.match(r"Release.*not found") with pytest.raises(NotFoundExc) as e: archive.lookup_object(ObjectType.REVISION, unknown_revision) assert e.match(r"Revision.*not found") with pytest.raises(NotFoundExc) as e: archive.lookup_object(ObjectType.SNAPSHOT, unknown_snapshot) assert e.match(r"Snapshot.*not found") def test_lookup_invalid_objects(invalid_sha1): with pytest.raises(BadInputExc) as e: archive.lookup_object(ObjectType.CONTENT, 
invalid_sha1) assert e.match("Invalid hash") with pytest.raises(BadInputExc) as e: archive.lookup_object(ObjectType.DIRECTORY, invalid_sha1) assert e.match("Invalid checksum") with pytest.raises(BadInputExc) as e: archive.lookup_object(ObjectType.RELEASE, invalid_sha1) assert e.match("Invalid checksum") with pytest.raises(BadInputExc) as e: archive.lookup_object(ObjectType.REVISION, invalid_sha1) assert e.match("Invalid checksum") with pytest.raises(BadInputExc) as e: archive.lookup_object(ObjectType.SNAPSHOT, invalid_sha1) assert e.match("Invalid checksum") def test_lookup_missing_hashes_non_present(): missing_cnt = random_sha1() missing_dir = random_sha1() missing_rev = random_sha1() missing_rel = random_sha1() missing_snp = random_sha1() grouped_swhids = { ObjectType.CONTENT: [hash_to_bytes(missing_cnt)], ObjectType.DIRECTORY: [hash_to_bytes(missing_dir)], ObjectType.REVISION: [hash_to_bytes(missing_rev)], ObjectType.RELEASE: [hash_to_bytes(missing_rel)], ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)], } actual_result = archive.lookup_missing_hashes(grouped_swhids) assert actual_result == { missing_cnt, missing_dir, missing_rev, missing_rel, missing_snp, } def test_lookup_missing_hashes_some_present(content, directory): missing_rev = random_sha1() missing_rel = random_sha1() missing_snp = random_sha1() grouped_swhids = { ObjectType.CONTENT: [hash_to_bytes(content["sha1_git"])], ObjectType.DIRECTORY: [hash_to_bytes(directory)], ObjectType.REVISION: [hash_to_bytes(missing_rev)], ObjectType.RELEASE: [hash_to_bytes(missing_rel)], ObjectType.SNAPSHOT: [hash_to_bytes(missing_snp)], } actual_result = archive.lookup_missing_hashes(grouped_swhids) assert actual_result == {missing_rev, missing_rel, missing_snp} def test_lookup_origin_extra_trailing_slash(origin): origin_info = archive.lookup_origin({"url": f"{origin['url']}/"}) assert origin_info["url"] == origin["url"] def test_lookup_origin_missing_trailing_slash(archive_data): deb_origin = Origin(url="http://snapshot.debian.org/package/r-base/") archive_data.origin_add([deb_origin]) origin_info = archive.lookup_origin({"url": deb_origin.url[:-1]}) assert origin_info["url"] == deb_origin.url def test_lookup_origin_single_slash_after_protocol(archive_data): origin_url = "http://snapshot.debian.org/package/r-base/" malformed_origin_url = "http:/snapshot.debian.org/package/r-base/" archive_data.origin_add([Origin(url=origin_url)]) origin_info = archive.lookup_origin({"url": malformed_origin_url}) assert origin_info["url"] == origin_url @given(new_origin()) def test_lookup_origins_get_by_sha1s(origin, unknown_origin): hasher = hashlib.sha1() hasher.update(origin["url"].encode("utf-8")) origin_info = OriginInfo(url=origin["url"]) origin_sha1 = hasher.hexdigest() hasher = hashlib.sha1() hasher.update(unknown_origin.url.encode("utf-8")) unknown_origin_sha1 = hasher.hexdigest() origins = list(archive.lookup_origins_by_sha1s([origin_sha1])) assert origins == [origin_info] origins = list(archive.lookup_origins_by_sha1s([origin_sha1, origin_sha1])) assert origins == [origin_info, origin_info] origins = list(archive.lookup_origins_by_sha1s([origin_sha1, unknown_origin_sha1])) assert origins == [origin_info, None] def test_search_origin(origin): results = archive.search_origin(url_pattern=origin["url"])[0] assert results == [{"url": origin["url"]}] def test_search_origin_use_ql(mocker, origin): ORIGIN = [{"url": origin["url"]}] mock_archive_search = mocker.patch("swh.web.common.archive.search") mock_archive_search.origin_search.return_value = 
PagedResult( - results=ORIGIN, next_page_token=None, + results=ORIGIN, + next_page_token=None, ) query = f"origin = '{origin['url']}'" results = archive.search_origin(url_pattern=query, use_ql=True)[0] assert results == ORIGIN mock_archive_search.origin_search.assert_called_with( query=query, page_token=None, with_visit=False, visit_types=None, limit=50 ) def test_lookup_snapshot_sizes(archive_data, snapshot): branches = archive_data.snapshot_get(snapshot)["branches"] expected_sizes = { "alias": 0, "release": 0, "revision": 0, } for branch_name, branch_info in branches.items(): if branch_info is not None: expected_sizes[branch_info["target_type"]] += 1 assert archive.lookup_snapshot_sizes(snapshot) == expected_sizes def test_lookup_snapshot_sizes_with_filtering(archive_data, revision): rev_id = hash_to_bytes(revision) snapshot = Snapshot( branches={ b"refs/heads/master": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), b"refs/heads/incoming": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), b"refs/pull/1": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), b"refs/pull/2": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), }, ) archive_data.snapshot_add([snapshot]) expected_sizes = {"alias": 0, "release": 0, "revision": 2} assert ( archive.lookup_snapshot_sizes( snapshot.id.hex(), branch_name_exclude_prefix="refs/pull/" ) == expected_sizes ) def test_lookup_snapshot_alias(snapshot): resolved_alias = archive.lookup_snapshot_alias(snapshot, "HEAD") assert resolved_alias is not None assert resolved_alias["target_type"] == "revision" assert resolved_alias["target"] is not None def test_lookup_snapshot_missing(revision): with pytest.raises(NotFoundExc): archive.lookup_snapshot(revision) def test_lookup_snapshot_empty_branch_list(archive_data, revision): rev_id = hash_to_bytes(revision) snapshot = Snapshot( branches={ b"refs/heads/master": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), }, ) archive_data.snapshot_add([snapshot]) # FIXME; This test will change once the inconsistency in storage is fixed # postgres backend returns None in case of a missing branch whereas the # in-memory implementation (used in tests) returns a data structure; # hence the inconsistency branches = archive.lookup_snapshot( - hash_to_hex(snapshot.id), branch_name_include_substring="non-existing", + hash_to_hex(snapshot.id), + branch_name_include_substring="non-existing", )["branches"] assert not branches def test_lookup_snapshot_branch_names_filtering(archive_data, revision): rev_id = hash_to_bytes(revision) snapshot = Snapshot( branches={ b"refs/heads/master": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), b"refs/heads/incoming": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), b"refs/pull/1": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), b"refs/pull/2": SnapshotBranch( - target=rev_id, target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), "non_ascii_name_é".encode(): SnapshotBranch( - target=rev_id, 
target_type=TargetType.REVISION, + target=rev_id, + target_type=TargetType.REVISION, ), }, ) archive_data.snapshot_add([snapshot]) for include_pattern, exclude_prefix, nb_results in ( ("pull", None, 2), ("incoming", None, 1), ("é", None, 1), (None, "refs/heads/", 3), ("refs", "refs/heads/master", 3), ): branches = archive.lookup_snapshot( hash_to_hex(snapshot.id), branch_name_include_substring=include_pattern, branch_name_exclude_prefix=exclude_prefix, )["branches"] assert len(branches) == nb_results for branch_name in branches: if include_pattern: assert include_pattern in branch_name if exclude_prefix: assert not branch_name.startswith(exclude_prefix) def test_lookup_snapshot_branch_names_filtering_paginated( archive_data, directory, revision ): pattern = "foo" nb_branches_by_target_type = 10 branches = {} for i in range(nb_branches_by_target_type): branches[f"branch/directory/bar{i}".encode()] = SnapshotBranch( - target=hash_to_bytes(directory), target_type=TargetType.DIRECTORY, + target=hash_to_bytes(directory), + target_type=TargetType.DIRECTORY, ) branches[f"branch/revision/bar{i}".encode()] = SnapshotBranch( - target=hash_to_bytes(revision), target_type=TargetType.REVISION, + target=hash_to_bytes(revision), + target_type=TargetType.REVISION, ) branches[f"branch/directory/{pattern}{i}".encode()] = SnapshotBranch( - target=hash_to_bytes(directory), target_type=TargetType.DIRECTORY, + target=hash_to_bytes(directory), + target_type=TargetType.DIRECTORY, ) branches[f"branch/revision/{pattern}{i}".encode()] = SnapshotBranch( - target=hash_to_bytes(revision), target_type=TargetType.REVISION, + target=hash_to_bytes(revision), + target_type=TargetType.REVISION, ) snapshot = Snapshot(branches=branches) archive_data.snapshot_add([snapshot]) branches_count = nb_branches_by_target_type // 2 for target_type in ( ObjectType.DIRECTORY.name.lower(), ObjectType.REVISION.name.lower(), ): partial_branches = archive.lookup_snapshot( hash_to_hex(snapshot.id), target_types=[target_type], branches_count=branches_count, branch_name_include_substring=pattern, ) branches = partial_branches["branches"] assert len(branches) == branches_count for branch_name, branch_data in branches.items(): assert pattern in branch_name assert branch_data["target_type"] == target_type for i in range(branches_count): assert f"branch/{target_type}/{pattern}{i}" in branches assert ( partial_branches["next_branch"] == f"branch/{target_type}/{pattern}{branches_count}" ) partial_branches = archive.lookup_snapshot( hash_to_hex(snapshot.id), target_types=[target_type], branches_from=partial_branches["next_branch"], branch_name_include_substring=pattern, ) branches = partial_branches["branches"] assert len(branches) == branches_count for branch_name, branch_data in branches.items(): assert pattern in branch_name assert branch_data["target_type"] == target_type for i in range(branches_count, 2 * branches_count): assert f"branch/{target_type}/{pattern}{i}" in branches assert partial_branches["next_branch"] is None diff --git a/swh/web/tests/common/test_converters.py b/swh/web/tests/common/test_converters.py index c00062bb..3af828cc 100644 --- a/swh/web/tests/common/test_converters.py +++ b/swh/web/tests/common/test_converters.py @@ -1,740 +1,752 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib from 
swh.model import hashutil from swh.model.model import ( ObjectType, Person, Release, Revision, RevisionType, TimestampWithTimezone, ) from swh.web.common import converters def test_fmap(): assert [2, 3, None, 4] == converters.fmap(lambda x: x + 1, [1, 2, None, 3]) assert [11, 12, 13] == list( converters.fmap(lambda x: x + 10, map(lambda x: x, [1, 2, 3])) ) assert {"a": 2, "b": 4} == converters.fmap(lambda x: x * 2, {"a": 1, "b": 2}) assert 100 == converters.fmap(lambda x: x * 10, 10) assert {"a": [2, 6], "b": 4} == converters.fmap( lambda x: x * 2, {"a": [1, 3], "b": 2} ) assert converters.fmap(lambda x: x, None) is None def test_from_swh(): some_input = { "a": "something", "b": "someone", "c": b"sharp-0.3.4.tgz", "d": hashutil.hash_to_bytes("b04caf10e9535160d90e874b45aa426de762f19f"), "e": b"sharp.html/doc_002dS_005fISREG.html", "g": [b"utf-8-to-decode", b"another-one"], "h": "something filtered", "i": {"e": b"something"}, "j": { "k": { "l": [b"bytes thing", b"another thingy", b""], "n": "don't care either", }, "m": "don't care", }, "o": "something", "p": b"foo", "q": {"extra-headers": [["a", b"intact"]]}, "w": None, "r": {"p": "also intact", "q": "bar"}, - "s": {"timestamp": 42, "offset": -420, "negative_utc": None,}, + "s": { + "timestamp": 42, + "offset": -420, + "negative_utc": None, + }, "s1": { "timestamp": {"seconds": 42, "microseconds": 0}, "offset": -420, "negative_utc": None, }, "s2": datetime.datetime(2013, 7, 1, 20, 0, 0, tzinfo=datetime.timezone.utc), "t": None, "u": None, "v": None, "x": None, } expected_output = { "a": "something", "b": "someone", "c": "sharp-0.3.4.tgz", "d": "b04caf10e9535160d90e874b45aa426de762f19f", "e": "sharp.html/doc_002dS_005fISREG.html", "g": ["utf-8-to-decode", "another-one"], "i": {"e": "something"}, "j": {"k": {"l": ["bytes thing", "another thingy", ""]}}, "p": "foo", "q": {"extra-headers": [["a", "intact"]]}, "w": {}, "r": {"p": "also intact", "q": "bar"}, "s": "1969-12-31T17:00:42-07:00", "s1": "1969-12-31T17:00:42-07:00", "s2": "2013-07-01T20:00:00+00:00", "u": {}, "v": [], "x": None, } actual_output = converters.from_swh( some_input, hashess={"d", "o", "x"}, bytess={"c", "e", "g", "l"}, dates={"s", "s1", "s2"}, blacklist={"h", "m", "n", "o"}, removables_if_empty={"t"}, empty_dict={"u"}, empty_list={"v"}, convert={"p", "q", "w"}, convert_fn=converters.convert_metadata, ) assert expected_output == actual_output def test_from_swh_edge_cases_do_no_conversion_if_none_or_not_bytes(): some_input = {"a": "something", "b": None, "c": "someone", "d": None, "e": None} expected_output = { "a": "something", "b": None, "c": "someone", "d": None, "e": None, } actual_output = converters.from_swh( some_input, hashess={"a", "b"}, bytess={"c", "d"}, dates={"e"} ) assert expected_output == actual_output def test_from_swh_edge_cases_convert_invalid_utf8_bytes(): some_input = { "a": "something", "b": "someone", "c": b"a name \xff", "d": b"an email \xff", } expected_output = { "a": "something", "b": "someone", "c": "a name \\xff", "d": "an email \\xff", "decoding_failures": ["c", "d"], } actual_output = converters.from_swh( some_input, hashess={"a", "b"}, bytess={"c", "d"} ) for v in ["a", "b", "c", "d"]: assert expected_output[v] == actual_output[v] assert len(expected_output["decoding_failures"]) == len( actual_output["decoding_failures"] ) for v in expected_output["decoding_failures"]: assert v in actual_output["decoding_failures"] def test_from_swh_empty(): assert {} == converters.from_swh({}) def test_from_swh_none(): assert converters.from_swh(None) is None 
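# --- editor's note (illustration, not part of the original patch) ---
# A minimal sketch of the conversion pattern the tests above exercise, assuming
# only the behaviour they demonstrate: converters.from_swh() maps a
# storage-level dict to a JSON-friendly one, hex-encoding the keys listed in
# `hashess` and decoding to str the keys listed in `bytess` (all keyword
# arguments are optional, as the bare from_swh({}) call above suggests).
# The `raw` dict below is made up for the example.
from swh.model import hashutil
from swh.web.common import converters

raw = {
    "id": hashutil.hash_to_bytes("b04caf10e9535160d90e874b45aa426de762f19f"),
    "name": b"sharp-0.3.4.tgz",
}
assert converters.from_swh(raw, hashess={"id"}, bytess={"name"}) == {
    "id": "b04caf10e9535160d90e874b45aa426de762f19f",
    "name": "sharp-0.3.4.tgz",
}
# --- end editor's note ---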
def test_from_origin(): origin_url = "rsync://ftp.gnu.org/gnu/octave" origin_input = { "id": hashlib.sha1(origin_url.encode("utf-8")).digest(), "url": origin_url, } expected_origin = { "url": origin_url, } actual_origin = converters.from_origin(origin_input) assert actual_origin == expected_origin def test_from_origin_visit(): snap_hash = "b5f0b7f716735ebffe38505c60145c4fd9da6ca3" for snap in [snap_hash, None]: visit = { "date": { "timestamp": datetime.datetime( 2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "origin": 10, "visit": 100, "metadata": None, "status": "full", "snapshot": hashutil.hash_to_bytes(snap) if snap else snap, } expected_visit = { "date": "2015-01-01T22:00:00+00:00", "origin": 10, "visit": 100, "metadata": {}, "status": "full", "snapshot": snap_hash if snap else snap, } actual_visit = converters.from_origin_visit(visit) assert actual_visit == expected_visit def test_from_release(): """Converting a release model object to a dict should be ok""" release_input = Release( id=hashutil.hash_to_bytes("aad23fa492a0c5fed0708a6703be875448c86884"), target=hashutil.hash_to_bytes("5e46d564378afc44b31bb89f99d5675195fbdf67"), target_type=ObjectType.REVISION, date=TimestampWithTimezone.from_datetime( datetime.datetime(2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc) ), author=Person( name=b"author name", fullname=b"Author Name author@email", email=b"author@email", ), name=b"v0.0.1", message=b"some comment on release", synthetic=True, ) expected_release = { "id": "aad23fa492a0c5fed0708a6703be875448c86884", "target": "5e46d564378afc44b31bb89f99d5675195fbdf67", "target_type": "revision", "date": "2015-01-01T22:00:00+00:00", "author": { "name": "author name", "fullname": "Author Name author@email", "email": "author@email", }, "name": "v0.0.1", "message": "some comment on release", "synthetic": True, } actual_release = converters.from_release(release_input) assert actual_release == expected_release def test_from_revision_model_object(): date = TimestampWithTimezone.from_datetime( datetime.datetime(2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc) ) revision_input = Revision( directory=hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), author=Person( name=b"Software Heritage", fullname=b"robot robot@softwareheritage.org", email=b"robot@softwareheritage.org", ), committer=Person( name=b"Software Heritage", fullname=b"robot robot@softwareheritage.org", email=b"robot@softwareheritage.org", ), message=b"synthetic revision message", date=date, committer_date=date, synthetic=True, type=RevisionType.TAR, parents=tuple( [ hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"), hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"), ] ), extra_headers=((b"gpgsig", b"some-signature"),), metadata={ "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, ) expected_revision = { "id": "a001358278a0d811fe7072463f805da601121c2a", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message":
"synthetic revision message", "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "parents": tuple( [ "29d8be353ed3480476f032475e7c244eff7371d5", "30d8be353ed3480476f032475e7c244eff7371d5", ] ), "type": "tar", "synthetic": True, "extra_headers": (("gpgsig", "some-signature"),), "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, "merge": True, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision(): ts = datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp() revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "directory": hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), "author": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "committer": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "message": b"synthetic revision message", - "date": {"timestamp": ts, "offset": 0, "negative_utc": False,}, - "committer_date": {"timestamp": ts, "offset": 0, "negative_utc": False,}, + "date": { + "timestamp": ts, + "offset": 0, + "negative_utc": False, + }, + "committer_date": { + "timestamp": ts, + "offset": 0, + "negative_utc": False, + }, "synthetic": True, "type": "tar", "parents": [ hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"), hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"), ], "children": [ hashutil.hash_to_bytes("123546353ed3480476f032475e7c244eff7371d5"), ], "metadata": { "extra_headers": [["gpgsig", b"some-signature"]], "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "synthetic revision message", "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], "parents": [ "29d8be353ed3480476f032475e7c244eff7371d5", "30d8be353ed3480476f032475e7c244eff7371d5", ], "type": "tar", "synthetic": True, "metadata": { "extra_headers": [["gpgsig", "some-signature"]], "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ], }, "merge": True, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision_nomerge(): revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), 
"parents": [hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5")], } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "parents": ["29d8be353ed3480476f032475e7c244eff7371d5"], "merge": False, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision_noparents(): revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "directory": hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), "author": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "committer": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "message": b"synthetic revision message", "date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "committer_date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "synthetic": True, "type": "tar", "children": [ hashutil.hash_to_bytes("123546353ed3480476f032475e7c244eff7371d5"), ], "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "synthetic revision message", "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], "type": "tar", "synthetic": True, "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_revision_invalid(): revision_input = { "id": hashutil.hash_to_bytes("18d8be353ed3480476f032475e7c233eff7371d5"), "directory": hashutil.hash_to_bytes("7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"), "author": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "committer": { "name": b"Software Heritage", "fullname": b"robot robot@softwareheritage.org", "email": b"robot@softwareheritage.org", }, "message": b"invalid message \xff", "date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "committer_date": { "timestamp": datetime.datetime( 2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc ).timestamp(), "offset": 0, "negative_utc": False, }, "synthetic": True, "type": "tar", "parents": [ hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"), 
hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"), ], "children": [ hashutil.hash_to_bytes("123546353ed3480476f032475e7c244eff7371d5"), ], "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, } expected_revision = { "id": "18d8be353ed3480476f032475e7c233eff7371d5", "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6", "author": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "committer": { "name": "Software Heritage", "fullname": "robot robot@softwareheritage.org", "email": "robot@softwareheritage.org", }, "message": "invalid message \\xff", "decoding_failures": ["message"], "date": "2000-01-17T11:23:54+00:00", "committer_date": "2000-01-17T11:23:54+00:00", "children": ["123546353ed3480476f032475e7c244eff7371d5"], "parents": [ "29d8be353ed3480476f032475e7c244eff7371d5", "30d8be353ed3480476f032475e7c244eff7371d5", ], "type": "tar", "synthetic": True, "metadata": { "original_artifact": [ { "archive_type": "tar", "name": "webbase-5.7.0.tar.gz", "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd", "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1", "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f" "309d36484e7edf7bb912", } ] }, "merge": True, } actual_revision = converters.from_revision(revision_input) assert actual_revision == expected_revision def test_from_content_none(): assert converters.from_content(None) is None def test_from_content(): content_input = { "sha1": hashutil.hash_to_bytes("5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5"), "sha256": hashutil.hash_to_bytes( "39007420ca5de7cb3cfc15196335507e" "e76c98930e7e0afa4d2747d3bf96c926" ), "blake2s256": hashutil.hash_to_bytes( "49007420ca5de7cb3cfc15196335507e" "e76c98930e7e0afa4d2747d3bf96c926" ), "sha1_git": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "ctime": "something-which-is-filtered-out", "data": b"data in bytes", "length": 10, "status": "hidden", } # 'ctime' is filtered out and the 'hidden' status is mapped to 'absent' expected_content = { "checksums": { "sha1": "5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5", "sha256": "39007420ca5de7cb3cfc15196335507ee76c98" "930e7e0afa4d2747d3bf96c926", "blake2s256": "49007420ca5de7cb3cfc15196335507ee7" "6c98930e7e0afa4d2747d3bf96c926", "sha1_git": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", }, "data": b"data in bytes", "length": 10, "status": "absent", } actual_content = converters.from_content(content_input) assert actual_content == expected_content def test_from_person(): person_input = { "id": 10, "anything": "else", "name": b"bob", "fullname": b"bob bob@alice.net", "email": b"bob@foo.alice", } expected_person = { "id": 10, "anything": "else", "name": "bob", "fullname": "bob bob@alice.net", "email": "bob@foo.alice", } actual_person = converters.from_person(person_input) assert actual_person == expected_person def test_from_directory_entries(): dir_entries_input = { "sha1": hashutil.hash_to_bytes("5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5"), "sha256": hashutil.hash_to_bytes( "39007420ca5de7cb3cfc15196335507e" "e76c98930e7e0afa4d2747d3bf96c926" ), "sha1_git": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "blake2s256": hashutil.hash_to_bytes( "685395c5dc57cada459364f0946d3dd45bad5fcbab" "c1048edb44380f1d31d0aa" ), "target":
hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "dir_id": hashutil.hash_to_bytes("40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03"), "name": b"bob", "type": 10, "status": "hidden", } expected_dir_entries = { "checksums": { "sha1": "5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5", "sha256": "39007420ca5de7cb3cfc15196335507ee76c98" "930e7e0afa4d2747d3bf96c926", "sha1_git": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", "blake2s256": "685395c5dc57cada459364f0946d3dd45bad5f" "cbabc1048edb44380f1d31d0aa", }, "target": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", "dir_id": "40e71b8614fcd89ccd17ca2b1d9e66c5b00a6d03", "name": "bob", "type": 10, "status": "absent", } actual_dir_entries = converters.from_directory_entry(dir_entries_input) assert actual_dir_entries == expected_dir_entries def test_from_filetype(): content_filetype = { "id": hashutil.hash_to_bytes("5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5"), "encoding": "utf-8", "mimetype": "text/plain", } expected_content_filetype = { "id": "5c6f0e2750f48fa0bd0c4cf5976ba0b9e02ebda5", "encoding": "utf-8", "mimetype": "text/plain", } actual_content_filetype = converters.from_filetype(content_filetype) assert actual_content_filetype == expected_content_filetype diff --git a/swh/web/tests/common/test_django_command.py b/swh/web/tests/common/test_django_command.py index 2c975210..aa9d1e18 100644 --- a/swh/web/tests/common/test_django_command.py +++ b/swh/web/tests/common/test_django_command.py @@ -1,178 +1,174 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from io import StringIO import pytest from django.core.management import call_command from swh.core.api.classes import stream_results from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_TASK_FAILED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_FAILED, VISIT_STATUS_FULL, VISIT_STATUS_PARTIAL, ) from swh.web.common.typing import SaveOriginRequestInfo from swh.web.config import get_config MODULE_FQDN = "swh.web.common.management.commands" COMMAND_NAME = "refresh_savecodenow_statuses" AUTHORIZED_ORIGIN_URL = "https://scm.ourproject.org/anonscm/%s" @pytest.fixture def mock_refresh(mocker): return mocker.patch( f"{MODULE_FQDN}.{COMMAND_NAME}.refresh_save_origin_request_statuses" ) @pytest.fixture def mock_scheduler(mocker, swh_scheduler): mock_scheduler = mocker.patch(f"{MODULE_FQDN}.{COMMAND_NAME}.get_scheduler") mock_scheduler.return_value = swh_scheduler return mock_scheduler @pytest.mark.parametrize("nb_results", [0, 10, 20]) def test_command_refresh__with_statuses_refreshed( mock_scheduler, mock_refresh, nb_results ): - """Refresh status command reports non-terminal statuses updates. 
- - """ + """Refresh status command reports non-terminal statuses updates.""" # fake returned refreshed status for 'archives' visit type - mock_refresh.return_value = [{"visit_type": "archives",}] * nb_results + mock_refresh.return_value = [ + { + "visit_type": "archives", + } + ] * nb_results out = StringIO() call_command(COMMAND_NAME, stdout=out) actual_output = out.getvalue() if nb_results > 0: assert f"updated {nb_results}" in actual_output else: assert "Nothing" in actual_output assert mock_scheduler.called assert mock_refresh.called @pytest.fixture def fake_refreshed_data(): - """Prepare test data within the scheduler and the swh-web model db - - """ + """Prepare test data within the scheduler and the swh-web model db""" duplicated_origin_url = AUTHORIZED_ORIGIN_URL % "specific-origin" - entries = ( - [ - { - "visit_type": "archives", # ignored from recurring task scheduling - "visit_status": VISIT_STATUS_FULL, - "task_status": SAVE_TASK_SUCCEEDED, - }, - { - "visit_type": "hg", # scheduled as recurring task - "visit_status": VISIT_STATUS_PARTIAL, - "task_status": SAVE_TASK_SUCCEEDED, - }, - { - "visit_type": "svn", # scheduled as recurring task - "visit_status": VISIT_STATUS_PARTIAL, - "task_status": SAVE_TASK_SCHEDULED, - }, - { - "visit_type": "svn", # ignored from recurring task scheduling - "visit_status": VISIT_STATUS_FAILED, - "task_status": SAVE_TASK_FAILED, - }, - { - "visit_type": "hg", # ignored from recurring task scheduling - "visit_status": "created", - "task_status": SAVE_TASK_SCHEDULED, - }, - ] - + [ - { - "visit_type": "git", - "visit_status": VISIT_STATUS_FULL, - "task_status": SAVE_TASK_SUCCEEDED, - "origin": duplicated_origin_url, - } - ] - * 3 - ) # only 1 of the origin duplicates will be scheduled as recurring task + entries = [ + { + "visit_type": "archives", # ignored from recurring task scheduling + "visit_status": VISIT_STATUS_FULL, + "task_status": SAVE_TASK_SUCCEEDED, + }, + { + "visit_type": "hg", # scheduled as recurring task + "visit_status": VISIT_STATUS_PARTIAL, + "task_status": SAVE_TASK_SUCCEEDED, + }, + { + "visit_type": "svn", # scheduled as recurring task + "visit_status": VISIT_STATUS_PARTIAL, + "task_status": SAVE_TASK_SCHEDULED, + }, + { + "visit_type": "svn", # ignored from recurring task scheduling + "visit_status": VISIT_STATUS_FAILED, + "task_status": SAVE_TASK_FAILED, + }, + { + "visit_type": "hg", # ignored from recurring task scheduling + "visit_status": "created", + "task_status": SAVE_TASK_SCHEDULED, + }, + ] + [ + { + "visit_type": "git", + "visit_status": VISIT_STATUS_FULL, + "task_status": SAVE_TASK_SUCCEEDED, + "origin": duplicated_origin_url, + } + ] * 3 # only 1 of the origin duplicates will be scheduled as recurring task time_now = datetime.now(tz=timezone.utc) - timedelta(days=len(entries)) return [ SaveOriginRequestInfo( visit_type=meta["visit_type"], visit_status=meta["visit_status"], origin_url=( meta["origin"] if "origin" in meta else AUTHORIZED_ORIGIN_URL % i ), save_request_date=time_now + timedelta(days=i - 1), save_request_status=SAVE_REQUEST_ACCEPTED, visit_date=time_now + timedelta(days=i), save_task_status=meta["task_status"], id=i, loading_task_id=i, note=None, ) for i, meta in enumerate(entries) ] def test_command_refresh__with_recurrent_tasks_scheduling( mock_scheduler, mock_refresh, fake_refreshed_data, swh_scheduler ): """Refresh status command report updates of statuses. The successful ones without the - type 'archived' are also scheduled recurringly. + type 'archived' are also scheduled recurringly. 
""" mock_refresh.return_value = fake_refreshed_data # only visit types (git, hg, svn) types with status (full, partial) are taken into # account for scheduling, so only 3 of those matches in the fake data set. expected_nb_scheduled = 0 origins = set() expected_nb_scheduled = 0 for entry in fake_refreshed_data: visit_type = entry["visit_type"] if visit_type == "archives": # only deal with git, svn, hg continue if entry["visit_status"] not in ("partial", "full"): continue origin = entry["origin_url"] if (visit_type, origin) in origins: continue origins.add((visit_type, origin)) expected_nb_scheduled += 1 assert expected_nb_scheduled == 3 out = StringIO() call_command(COMMAND_NAME, stdout=out) actual_output = out.getvalue() assert f"Successfully updated {len(fake_refreshed_data)}" in actual_output lister = swh_scheduler.get_or_create_lister( name="save-code-now", instance_name=get_config()["instance_name"] ) result = list(stream_results(swh_scheduler.get_listed_origins, lister.id)) assert len(result) == expected_nb_scheduled assert mock_scheduler.called assert mock_refresh.called diff --git a/swh/web/tests/common/test_identifiers.py b/swh/web/tests/common/test_identifiers.py index 030f7df8..3a49eec6 100644 --- a/swh/web/tests/common/test_identifiers.py +++ b/swh/web/tests/common/test_identifiers.py @@ -1,761 +1,765 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random from urllib.parse import quote import pytest from swh.model.hashutil import hash_to_bytes from swh.model.model import Origin from swh.model.swhids import ObjectType, QualifiedSWHID from swh.web.browse.snapshot_context import get_snapshot_context from swh.web.common.exc import BadInputExc from swh.web.common.identifiers import ( gen_swhid, get_swhid, get_swhids_info, group_swhids, parse_object_type, resolve_swhid, ) from swh.web.common.typing import SWHObjectInfo from swh.web.common.utils import reverse from swh.web.tests.data import random_sha1 def test_gen_swhid(content): swh_object_type = ObjectType.CONTENT sha1_git = content["sha1_git"] expected_swhid = "swh:1:cnt:" + sha1_git assert gen_swhid(swh_object_type, sha1_git) == expected_swhid assert ( gen_swhid(swh_object_type, sha1_git, metadata={"origin": "test"}) == expected_swhid + ";origin=test" ) assert ( gen_swhid(swh_object_type, sha1_git, metadata={"origin": None}) == expected_swhid ) with pytest.raises(BadInputExc) as e: gen_swhid(swh_object_type, "not a valid id") assert e.match("Invalid object") def test_parse_object_type(): assert parse_object_type("content") == ObjectType.CONTENT assert parse_object_type("directory") == ObjectType.DIRECTORY assert parse_object_type("revision") == ObjectType.REVISION assert parse_object_type("release") == ObjectType.RELEASE assert parse_object_type("snapshot") == ObjectType.SNAPSHOT with pytest.raises(BadInputExc) as e: parse_object_type("foo") assert e.match("Invalid swh object type") def test_resolve_swhid_legacy(content, directory, release, revision, snapshot): for obj_type, obj_id in ( (ObjectType.CONTENT, content["sha1_git"]), (ObjectType.DIRECTORY, directory), (ObjectType.RELEASE, release), (ObjectType.REVISION, revision), (ObjectType.SNAPSHOT, snapshot), ): swhid = gen_swhid(obj_type, obj_id) url_args = {} if obj_type == ObjectType.CONTENT: url_args["query_string"] = f"sha1_git:{obj_id}" elif obj_type == 
ObjectType.SNAPSHOT: url_args["snapshot_id"] = obj_id else: url_args["sha1_git"] = obj_id query_params = {"origin_url": "some-origin"} browse_url = reverse( f"browse-{obj_type.name.lower()}", url_args=url_args, query_params=query_params, ) for swhid_ in (swhid, swhid.upper()): resolved_swhid = resolve_swhid(swhid_, query_params) assert isinstance(resolved_swhid["swhid_parsed"], QualifiedSWHID) assert str(resolved_swhid["swhid_parsed"]) == swhid assert resolved_swhid["browse_url"] == browse_url with pytest.raises(BadInputExc, match="'ori' is not a valid ObjectType"): resolve_swhid(f"swh:1:ori:{random_sha1()}") def test_get_swhid(content, directory, release, revision, snapshot): for obj_type, obj_id in ( (ObjectType.CONTENT, content["sha1_git"]), (ObjectType.DIRECTORY, directory), (ObjectType.RELEASE, release), (ObjectType.REVISION, revision), (ObjectType.SNAPSHOT, snapshot), ): swhid = gen_swhid(obj_type, obj_id) for swhid_ in (swhid, swhid.upper()): swh_parsed_swhid = get_swhid(swhid_) assert isinstance(swh_parsed_swhid, QualifiedSWHID) assert str(swh_parsed_swhid) == swhid.lower() with pytest.raises(BadInputExc, match="Error when parsing identifier"): get_swhid("foo") def test_group_swhids(content, directory, release, revision, snapshot): swhids = [] expected = {} for obj_type, obj_id in ( (ObjectType.CONTENT, content["sha1_git"]), (ObjectType.DIRECTORY, directory), (ObjectType.RELEASE, release), (ObjectType.REVISION, revision), (ObjectType.SNAPSHOT, snapshot), ): swhid = gen_swhid(obj_type, obj_id) swhid = get_swhid(swhid) swhids.append(swhid) expected[obj_type] = [hash_to_bytes(obj_id)] swhid_groups = group_swhids(swhids) assert swhid_groups == expected def test_get_swhids_info_directory_context(archive_data, directory_with_subdirs): swhid = get_swhids_info( [ SWHObjectInfo( object_type=ObjectType.DIRECTORY, object_id=directory_with_subdirs ) ], snapshot_context=None, )[0] assert swhid["swhid_with_context"] is None # path qualifier should be discarded for a root directory swhid = get_swhids_info( [ SWHObjectInfo( object_type=ObjectType.DIRECTORY, object_id=directory_with_subdirs ) ], snapshot_context=None, extra_context={"path": "/"}, )[0] assert swhid["swhid_with_context"] is None dir_content = archive_data.directory_ls(directory_with_subdirs) dir_subdirs = [e for e in dir_content if e["type"] == "dir"] dir_subdir = random.choice(dir_subdirs) dir_subdir_path = f'/{dir_subdir["name"]}/' dir_subdir_content = archive_data.directory_ls(dir_subdir["target"]) dir_subdir_files = [e for e in dir_subdir_content if e["type"] == "file"] swh_objects_info = [ SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=dir_subdir["target"]) ] extra_context = { "root_directory": directory_with_subdirs, "path": dir_subdir_path, } if dir_subdir_files: dir_subdir_file = random.choice(dir_subdir_files) extra_context["filename"] = dir_subdir_file["name"] swh_objects_info.append( SWHObjectInfo( object_type=ObjectType.CONTENT, object_id=dir_subdir_file["checksums"]["sha1_git"], ) ) swhids = get_swhids_info( - swh_objects_info, snapshot_context=None, extra_context=extra_context, + swh_objects_info, + snapshot_context=None, + extra_context=extra_context, ) swhid_lower = swhids[0]["swhid_with_context"] swhid_upper = swhid_lower.replace(swhids[0]["swhid"], swhids[0]["swhid"].upper()) for swhid in (swhid_lower, swhid_upper): swhid_dir_parsed = get_swhid(swhid) anchor = gen_swhid(ObjectType.DIRECTORY, directory_with_subdirs) assert swhid_dir_parsed.qualifiers() == { "anchor": anchor, "path": dir_subdir_path, } if 
dir_subdir_files: swhid_cnt_parsed = get_swhid(swhids[1]["swhid_with_context"]) assert swhid_cnt_parsed.qualifiers() == { "anchor": anchor, "path": f'{dir_subdir_path}{dir_subdir_file["name"]}', } def test_get_swhids_info_revision_context(archive_data, revision): revision_data = archive_data.revision_get(revision) directory = revision_data["directory"] dir_content = archive_data.directory_ls(directory) dir_entry = random.choice(dir_content) swh_objects = [ SWHObjectInfo(object_type=ObjectType.REVISION, object_id=revision), SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=directory), ] extra_context = {"revision": revision, "path": "/"} if dir_entry["type"] == "file": swh_objects.append( SWHObjectInfo( object_type=ObjectType.CONTENT, object_id=dir_entry["checksums"]["sha1_git"], ) ) extra_context["filename"] = dir_entry["name"] swhids = get_swhids_info( - swh_objects, snapshot_context=None, extra_context=extra_context, + swh_objects, + snapshot_context=None, + extra_context=extra_context, ) assert swhids[0]["context"] == {} swhid_lower = swhids[1]["swhid_with_context"] swhid_upper = swhid_lower.replace(swhids[1]["swhid"], swhids[1]["swhid"].upper()) for swhid in (swhid_lower, swhid_upper): swhid_dir_parsed = get_swhid(swhid) anchor = gen_swhid(ObjectType.REVISION, revision) assert swhid_dir_parsed.qualifiers() == { "anchor": anchor, } if dir_entry["type"] == "file": swhid_cnt_parsed = get_swhid(swhids[2]["swhid_with_context"]) assert swhid_cnt_parsed.qualifiers() == { "anchor": anchor, "path": f'/{dir_entry["name"]}', } def test_get_swhids_info_origin_snapshot_context( archive_data, origin_with_multiple_visits ): """ Test SWHIDs with contextual info computation under a variety of origin / snapshot browsing contexts. """ origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) for visit in visits: snapshot = archive_data.snapshot_get(visit["snapshot"]) snapshot_id = snapshot["id"] branches = { k: v["target"] for k, v in snapshot["branches"].items() if v["target_type"] == "revision" } releases = { k: v["target"] for k, v in snapshot["branches"].items() if v["target_type"] == "release" } head_rev_id = archive_data.snapshot_get_head(snapshot) head_rev = archive_data.revision_get(head_rev_id) root_dir = head_rev["directory"] dir_content = archive_data.directory_ls(root_dir) dir_files = [e for e in dir_content if e["type"] == "file"] dir_file = random.choice(dir_files) revision_log = [r["id"] for r in archive_data.revision_log(head_rev_id)] branch_name = random.choice(list(branches)) release = random.choice(list(releases)) release_data = archive_data.release_get(releases[release]) release_name = release_data["name"] revision_id = random.choice(revision_log) for snp_ctx_params, anchor_info in ( ( {"snapshot_id": snapshot_id}, {"anchor_type": ObjectType.REVISION, "anchor_id": head_rev_id}, ), ( {"snapshot_id": snapshot_id, "branch_name": branch_name}, { "anchor_type": ObjectType.REVISION, "anchor_id": branches[branch_name], }, ), ( {"snapshot_id": snapshot_id, "release_name": release_name}, {"anchor_type": ObjectType.RELEASE, "anchor_id": releases[release]}, ), ( {"snapshot_id": snapshot_id, "revision_id": revision_id}, {"anchor_type": ObjectType.REVISION, "anchor_id": revision_id}, ), ( {"origin_url": origin_url, "snapshot_id": snapshot_id}, {"anchor_type": ObjectType.REVISION, "anchor_id": head_rev_id}, ), ( { "origin_url": origin_url, "snapshot_id": snapshot_id, "branch_name": branch_name, }, { "anchor_type": ObjectType.REVISION, "anchor_id": 
branches[branch_name], }, ), ( { "origin_url": origin_url, "snapshot_id": snapshot_id, "release_name": release_name, }, {"anchor_type": ObjectType.RELEASE, "anchor_id": releases[release]}, ), ( { "origin_url": origin_url, "snapshot_id": snapshot_id, "revision_id": revision_id, }, {"anchor_type": ObjectType.REVISION, "anchor_id": revision_id}, ), ): snapshot_context = get_snapshot_context(**snp_ctx_params) rev_id = head_rev_id if "branch_name" in snp_ctx_params: rev_id = branches[branch_name] elif "release_name" in snp_ctx_params: rev_id = release_data["target"] elif "revision_id" in snp_ctx_params: rev_id = revision_id swh_objects = [ SWHObjectInfo( object_type=ObjectType.CONTENT, object_id=dir_file["checksums"]["sha1_git"], ), SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=root_dir), SWHObjectInfo(object_type=ObjectType.REVISION, object_id=rev_id), SWHObjectInfo(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id), ] if "release_name" in snp_ctx_params: swh_objects.append( SWHObjectInfo( object_type=ObjectType.RELEASE, object_id=release_data["id"] ) ) swhids = get_swhids_info( swh_objects, snapshot_context, extra_context={"path": "/", "filename": dir_file["name"]}, ) swhid_cnt_parsed = get_swhid(swhids[0]["swhid_with_context"]) swhid_dir_parsed = get_swhid(swhids[1]["swhid_with_context"]) swhid_rev_parsed = get_swhid(swhids[2]["swhid_with_context"]) swhid_snp_parsed = get_swhid( swhids[3]["swhid_with_context"] or swhids[3]["swhid"] ) swhid_rel_parsed = None if "release_name" in snp_ctx_params: swhid_rel_parsed = get_swhid(swhids[4]["swhid_with_context"]) anchor = gen_swhid( object_type=anchor_info["anchor_type"], object_id=anchor_info["anchor_id"], ) snapshot_swhid = gen_swhid( object_type=ObjectType.SNAPSHOT, object_id=snapshot_id ) expected_cnt_context = { "visit": snapshot_swhid, "anchor": anchor, "path": f'/{dir_file["name"]}', } expected_dir_context = { "visit": snapshot_swhid, "anchor": anchor, } expected_rev_context = {"visit": snapshot_swhid} expected_snp_context = {} if "origin_url" in snp_ctx_params: expected_cnt_context["origin"] = origin_url expected_dir_context["origin"] = origin_url expected_rev_context["origin"] = origin_url expected_snp_context["origin"] = origin_url assert swhid_cnt_parsed.qualifiers() == expected_cnt_context assert swhid_dir_parsed.qualifiers() == expected_dir_context assert swhid_rev_parsed.qualifiers() == expected_rev_context assert swhid_snp_parsed.qualifiers() == expected_snp_context if "release_name" in snp_ctx_params: assert swhid_rel_parsed.qualifiers() == expected_rev_context def test_get_swhids_info_characters_and_url_escaping(archive_data, directory, origin): snapshot_context = get_snapshot_context(origin_url=origin["url"]) snapshot_context["origin_info"]["url"] = "http://example.org/?project=abc;def%" path = "/foo;/bar%" swhid_info = get_swhids_info( [SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=directory)], snapshot_context=snapshot_context, extra_context={"path": path}, )[0] # check special characters in SWHID have been escaped assert ( swhid_info["context"]["origin"] == "http://example.org/?project%3Dabc%3Bdef%25" ) assert swhid_info["context"]["path"] == "/foo%3B/bar%25" # check special characters in SWHID URL have been escaped parsed_url_swhid = QualifiedSWHID.from_string( swhid_info["swhid_with_context_url"][1:] ) assert ( parsed_url_swhid.qualifiers()["origin"] == "http://example.org/%3Fproject%253Dabc%253Bdef%2525" ) assert parsed_url_swhid.qualifiers()["path"] == "/foo%253B/bar%2525" def 
test_resolve_swhids_snapshot_context( client, archive_data, origin_with_multiple_visits ): origin_url = origin_with_multiple_visits["url"] visits = archive_data.origin_visit_get(origin_url) visit = random.choice(visits) snapshot = archive_data.snapshot_get(visit["snapshot"]) head_rev_id = archive_data.snapshot_get_head(snapshot) branch_info = None release_info = None for branch_name in sorted(snapshot["branches"]): target_type = snapshot["branches"][branch_name]["target_type"] target = snapshot["branches"][branch_name]["target"] if target_type == "revision" and branch_info is None: branch_info = {"name": branch_name, "revision": target} elif target_type == "release" and release_info is None: release_info = {"name": branch_name, "release": target} if branch_info and release_info: break release_info["name"] = archive_data.release_get(release_info["release"])["name"] directory = archive_data.revision_get(branch_info["revision"])["directory"] directory_content = archive_data.directory_ls(directory) directory_subdirs = [e for e in directory_content if e["type"] == "dir"] directory_subdir = None if directory_subdirs: directory_subdir = random.choice(directory_subdirs) directory_files = [e for e in directory_content if e["type"] == "file"] directory_file = None if directory_files: directory_file = random.choice(directory_files) random_rev_id = random.choice(archive_data.revision_log(head_rev_id))["id"] for snp_ctx_params in ( {}, {"branch_name": branch_info["name"]}, {"release_name": release_info["name"]}, {"revision_id": random_rev_id}, ): snapshot_context = get_snapshot_context( snapshot["id"], origin_url, **snp_ctx_params ) _check_resolved_swhid_browse_url( ObjectType.SNAPSHOT, snapshot["id"], snapshot_context ) rev = head_rev_id if "branch_name" in snp_ctx_params: rev = branch_info["revision"] if "revision_id" in snp_ctx_params: rev = random_rev_id _check_resolved_swhid_browse_url(ObjectType.REVISION, rev, snapshot_context) _check_resolved_swhid_browse_url( ObjectType.DIRECTORY, directory, snapshot_context, path="/" ) if directory_subdir: _check_resolved_swhid_browse_url( ObjectType.DIRECTORY, directory_subdir["target"], snapshot_context, path=f"/{directory_subdir['name']}/", ) if directory_file: _check_resolved_swhid_browse_url( ObjectType.CONTENT, directory_file["target"], snapshot_context, path=f"/{directory_file['name']}", ) _check_resolved_swhid_browse_url( ObjectType.CONTENT, directory_file["target"], snapshot_context, path=f"/{directory_file['name']}", lines="10", ) _check_resolved_swhid_browse_url( ObjectType.CONTENT, directory_file["target"], snapshot_context, path=f"/{directory_file['name']}", lines="10-20", ) def _check_resolved_swhid_browse_url( object_type, object_id, snapshot_context, path=None, lines=None ): snapshot_id = snapshot_context["snapshot_id"] origin_url = None if snapshot_context["origin_info"]: origin_url = snapshot_context["origin_info"]["url"] obj_context = {} query_params = {} if origin_url: obj_context["origin"] = origin_url query_params["origin_url"] = origin_url obj_context["visit"] = gen_swhid(ObjectType.SNAPSHOT, snapshot_id) query_params["snapshot"] = snapshot_id if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY, ObjectType.REVISION): if snapshot_context["release"]: obj_context["anchor"] = gen_swhid( ObjectType.RELEASE, snapshot_context["release_id"] ) query_params["release"] = snapshot_context["release"] else: obj_context["anchor"] = gen_swhid( ObjectType.REVISION, snapshot_context["revision_id"] ) if object_type != ObjectType.REVISION: 
query_params["revision"] = snapshot_context["revision_id"] if path: obj_context["path"] = path if path != "/": if object_type == ObjectType.CONTENT: query_params["path"] = path[1:] else: query_params["path"] = path[1:-1] if object_type == ObjectType.DIRECTORY: object_id = snapshot_context["root_directory"] if lines: obj_context["lines"] = lines obj_core_swhid = gen_swhid(object_type, object_id) obj_swhid_lower = gen_swhid(object_type, object_id, metadata=obj_context) obj_swhid_upper = obj_swhid_lower.replace(obj_core_swhid, obj_core_swhid.upper(), 1) for obj_swhid in (obj_swhid_lower, obj_swhid_upper): obj_swhid_resolved = resolve_swhid(obj_swhid) url_args = {"sha1_git": object_id} if object_type == ObjectType.CONTENT: url_args = {"query_string": f"sha1_git:{object_id}"} elif object_type == ObjectType.SNAPSHOT: url_args = {"snapshot_id": object_id} expected_url = reverse( f"browse-{object_type.name.lower()}", url_args=url_args, query_params=query_params, ) if lines: lines_number = lines.split("-") expected_url += f"#L{lines_number[0]}" if len(lines_number) > 1: expected_url += f"-L{lines_number[1]}" assert obj_swhid_resolved["browse_url"] == expected_url def test_resolve_swhid_with_escaped_chars(archive_data, directory): origin_url = "http://example.org/?project=abc;" archive_data.origin_add([Origin(url=origin_url)]) origin_swhid_escaped = quote(origin_url, safe="/?:@&") origin_swhid_url_escaped = quote(origin_url, safe="/:@;") swhid = gen_swhid( ObjectType.DIRECTORY, directory, metadata={"origin": origin_swhid_escaped} ) resolved_swhid = resolve_swhid(swhid) assert resolved_swhid["swhid_parsed"].origin == origin_swhid_escaped assert origin_swhid_url_escaped in resolved_swhid["browse_url"] def test_resolve_directory_swhid_path_without_trailing_slash( archive_data, directory_with_subdirs ): dir_content = archive_data.directory_ls(directory_with_subdirs) dir_subdirs = [e for e in dir_content if e["type"] == "dir"] dir_subdir = random.choice(dir_subdirs) dir_subdir_path = dir_subdir["name"] anchor = gen_swhid(ObjectType.DIRECTORY, directory_with_subdirs) swhid = gen_swhid( ObjectType.DIRECTORY, dir_subdir["target"], metadata={"anchor": anchor, "path": "/" + dir_subdir_path}, ) resolved_swhid = resolve_swhid(swhid) browse_url = reverse( "browse-directory", url_args={"sha1_git": directory_with_subdirs}, query_params={"path": dir_subdir_path}, ) assert resolved_swhid["browse_url"] == browse_url def test_resolve_swhid_with_malformed_origin_url(archive_data, directory): origin_url = "http://example.org/project/abc" malformed_origin_url = "http:/example.org/project/abc" archive_data.origin_add([Origin(url=origin_url)]) swhid = gen_swhid( ObjectType.DIRECTORY, directory, metadata={"origin": malformed_origin_url} ) resolved_swhid = resolve_swhid(swhid) assert origin_url in resolved_swhid["browse_url"] def test_resolve_dir_entry_swhid_with_anchor_revision(archive_data, revision): revision_data = archive_data.revision_get(revision) directory = revision_data["directory"] dir_content = archive_data.directory_ls(directory) dir_entry = random.choice(dir_content) rev_swhid = gen_swhid(ObjectType.REVISION, revision) if dir_entry["type"] == "rev": return if dir_entry["type"] == "file": swhid = gen_swhid( ObjectType.CONTENT, dir_entry["checksums"]["sha1_git"], metadata={"anchor": rev_swhid, "path": f"/{dir_entry['name']}"}, ) else: swhid = gen_swhid( ObjectType.DIRECTORY, dir_entry["target"], metadata={"anchor": rev_swhid, "path": f"/{dir_entry['name']}/"}, ) browse_url = reverse( "browse-revision", 
url_args={"sha1_git": revision}, query_params={"path": dir_entry["name"]}, ) resolved_swhid = resolve_swhid(swhid) assert resolved_swhid["browse_url"] == browse_url def test_resolve_dir_entry_swhid_with_anchor_directory( archive_data, directory_with_subdirs ): dir_content = archive_data.directory_ls(directory_with_subdirs) dir_entry = random.choice( [entry for entry in dir_content if entry["type"] == "dir"] ) dir_swhid = gen_swhid(ObjectType.DIRECTORY, directory_with_subdirs) swhid = gen_swhid( ObjectType.DIRECTORY, dir_entry["target"], metadata={"anchor": dir_swhid, "path": f"/{dir_entry['name']}/"}, ) browse_url = reverse( "browse-directory", url_args={"sha1_git": directory_with_subdirs}, query_params={"path": f"{dir_entry['name']}"}, ) resolved_swhid = resolve_swhid(swhid) assert resolved_swhid["browse_url"] == browse_url def test_resolve_file_entry_swhid_with_anchor_directory( archive_data, directory_with_files ): dir_content = archive_data.directory_ls(directory_with_files) file_entry = random.choice( [entry for entry in dir_content if entry["type"] == "file"] ) dir_swhid = gen_swhid(ObjectType.DIRECTORY, directory_with_files) sha1_git = file_entry["checksums"]["sha1_git"] swhid = gen_swhid( ObjectType.CONTENT, sha1_git, metadata={"anchor": dir_swhid, "path": f"/{file_entry['name']}"}, ) browse_url = reverse( "browse-content", url_args={"query_string": f"sha1_git:{sha1_git}"}, query_params={"path": f"{directory_with_files}/{file_entry['name']}"}, ) resolved_swhid = resolve_swhid(swhid) assert resolved_swhid["browse_url"] == browse_url diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py index f4c67cdf..507a34a7 100644 --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -1,760 +1,789 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta, timezone from functools import partial import re from typing import Optional import uuid import iso8601 import pytest import requests from swh.core.pytest_plugin import get_response_cb from swh.scheduler.utils import create_oneshot_task_dict from swh.web.common.exc import BadInputExc from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, VISIT_STATUS_CREATED, VISIT_STATUS_FULL, VISIT_STATUS_ONGOING, VISIT_STATUS_PARTIAL, SaveOriginRequest, ) from swh.web.common.origin_save import ( _check_origin_exists, _check_visit_type_savable, _visit_type_task, _visit_type_task_privileged, get_savable_visit_types, get_save_origin_requests, get_save_origin_task_info, origin_exists, refresh_save_origin_request_statuses, ) from swh.web.common.typing import ( OriginExistenceCheckInfo, OriginVisitInfo, SaveOriginRequestInfo, ) from swh.web.config import get_config _es_url = "http://esnode1.internal.softwareheritage.org:9200" _es_workers_index_url = "%s/swh_workers-*" % _es_url _origin_url = "https://gitlab.com/inkscape/inkscape" _visit_type = "git" _task_id = 1 @pytest.fixture(autouse=True) def requests_mock_datadir(datadir, requests_mock_datadir): """Override default behavior to deal with post method""" cb = partial(get_response_cb, datadir=datadir) requests_mock_datadir.post(re.compile("https?://"), body=cb) return requests_mock_datadir 
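# --- editor's note (illustration, not part of the original patch) ---
# The fixture above re-registers the datadir callback so that POST requests
# (such as the Elasticsearch "/_search" queries issued further down) are also
# served from the test datadir, which the default fixture behaviour does not
# cover. A hedged sketch of the same idea for another HTTP verb; the fixture
# name below is hypothetical and only reuses objects already visible above.
@pytest.fixture
def requests_mock_datadir_put(datadir, requests_mock_datadir):
    """Serve PUT requests from the datadir as well (hypothetical variant)."""
    cb = partial(get_response_cb, datadir=datadir)
    requests_mock_datadir.put(re.compile("https?://"), body=cb)
    return requests_mock_datadir
# --- end editor's note ---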
@pytest.mark.django_db def test_get_save_origin_archived_task_info(swh_scheduler): _get_save_origin_task_info_test(swh_scheduler, task_archived=True) @pytest.mark.django_db def test_get_save_origin_task_info_without_es(swh_scheduler): _get_save_origin_task_info_test(swh_scheduler, es_available=False) def _fill_scheduler_db( swh_scheduler, task_status="completed", task_run_status="eventful", task_archived=False, visit_started_date=None, ): task = task_run = None if not task_archived: task = swh_scheduler.create_tasks( [create_oneshot_task_dict("load-git", repo_url=_origin_url)] )[0] backend_id = str(uuid.uuid4()) if task_status != "next_run_not_scheduled": swh_scheduler.schedule_task_run(task["id"], backend_id) if task_run_status is not None: swh_scheduler.start_task_run(backend_id) task_run = dict( swh_scheduler.end_task_run(backend_id, task_run_status).items() ) return task, task_run @pytest.mark.parametrize( "wrong_type,privileged_user", [ ("dummy", True), ("dumb", False), ("archives", False), # when no privilege, this is rejected ], ) def test_check_visit_type_savable(wrong_type, privileged_user, swh_scheduler): swh_scheduler.add_load_archive_task_type() with pytest.raises(BadInputExc, match="Allowed types"): _check_visit_type_savable(wrong_type, privileged_user) # when privileged_user, the following is accepted though _check_visit_type_savable("archives", True) def test_get_savable_visit_types(swh_scheduler): swh_scheduler.add_load_archive_task_type() default_list = list(_visit_type_task.keys()) assert set(get_savable_visit_types()) == set(default_list) privileged_list = default_list.copy() privileged_list += list(_visit_type_task_privileged.keys()) assert set(get_savable_visit_types(privileged_user=True)) == set(privileged_list) def _get_save_origin_task_info_test( swh_scheduler, task_archived=False, es_available=True, full_info=True ): swh_web_config = get_config() if es_available: swh_web_config.update({"es_workers_index_url": _es_workers_index_url}) else: swh_web_config.update({"es_workers_index_url": ""}) sor = SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, origin_url="https://gitlab.com/inkscape/inkscape", status=SAVE_REQUEST_ACCEPTED, visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1), loading_task_id=_task_id, ) task, task_run = _fill_scheduler_db(swh_scheduler, task_archived=task_archived) es_response = requests.post("%s/_search" % _es_workers_index_url).json() task_exec_data = es_response["hits"]["hits"][-1]["_source"] sor_task_info = get_save_origin_task_info(sor.id, full_info=full_info) expected_result = ( { "type": task["type"], "arguments": task["arguments"], "id": task["id"], "backend_id": task_run["backend_id"], "scheduled": task_run["scheduled"], "started": task_run["started"], "ended": task_run["ended"], "status": task_run["status"], "visit_status": sor.visit_status, } if not task_archived else {} ) if es_available and not task_archived: expected_result.update( { "message": task_exec_data["message"], "name": task_exec_data["swh_task_name"], "worker": task_exec_data["hostname"], } ) if not full_info: expected_result.pop("id", None) expected_result.pop("backend_id", None) expected_result.pop("worker", None) if "message" in expected_result: message = "" message_lines = expected_result["message"].split("\n") for line in message_lines: if line.startswith("Traceback"): break message += f"{line}\n" message += message_lines[-1] expected_result["message"] = message assert sor_task_info == expected_result 
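# --- editor's note (illustration, not part of the original patch) ---
# When full_info is False, the helper above trims the worker message: every
# line up to (but excluding) the first one starting with "Traceback" is kept,
# then the message's last line is appended. The standalone function below is a
# sketch of that truncation, mirroring the loop in the helper; its name is
# made up for the example.
def _truncate_worker_message(message: str) -> str:
    truncated = ""
    lines = message.split("\n")
    for line in lines:
        if line.startswith("Traceback"):
            # drop the traceback body; only its final line is kept below
            break
        truncated += f"{line}\n"
    return truncated + lines[-1]
# --- end editor's note ---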
@pytest.mark.django_db def test_get_save_origin_requests_find_visit_date(mocker, swh_scheduler): # create a save request SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archive _fill_scheduler_db(swh_scheduler) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", status=VISIT_STATUS_FULL, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # check visit date has been correctly found sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] == visit_date mock_archive.origin_visit_find_by_date.assert_called_once() # check visit is not searched again when it has been found get_save_origin_requests(_visit_type, _origin_url) mock_archive.origin_visit_find_by_date.assert_called_once() # check visit dates are not searched for save requests older than # one month sor = SaveOriginRequest.objects.create( visit_type=_visit_type, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, loading_task_id=_task_id, visit_date=None, ) sor.request_date = datetime.now(tz=timezone.utc) - timedelta(days=31) sor.save() _fill_scheduler_db(swh_scheduler, task_status="disabled", task_run_status="failed") sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 2 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_date"] is None mock_archive.origin_visit_find_by_date.assert_called_once() def _get_save_origin_requests( mocker, swh_scheduler, load_status, visit_status, request_date: Optional[datetime] = None, ): - """Wrapper around the get_origin_save_origin_request call.
- - """ + """Wrapper around the get_origin_save_origin_request call.""" SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=visit_status, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archives _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status=load_status ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=visit_status, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info sors = get_save_origin_requests(_visit_type, _origin_url) mock_archive.origin_visit_find_by_date.assert_called_once() return sors @pytest.mark.parametrize("visit_date", [None, "some-date"]) def test_from_save_origin_request_to_save_request_info_dict(visit_date): - """Ensure save request to json serializable dict is fine - - """ + """Ensure save request to json serializable dict is fine""" request_date = datetime.now(tz=timezone.utc) _visit_date = request_date + timedelta(minutes=5) if visit_date else None request_date = datetime.now(tz=timezone.utc) note = "request succeeded" sor = SaveOriginRequest( request_date=request_date, visit_type=_visit_type, visit_status=VISIT_STATUS_FULL, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, loading_task_status=None, visit_date=_visit_date, loading_task_id=1, note=note, ) assert sor.to_dict() == SaveOriginRequestInfo( id=sor.id, origin_url=sor.origin_url, visit_type=sor.visit_type, save_request_date=sor.request_date.isoformat(), save_request_status=sor.status, save_task_status=sor.loading_task_status, visit_status=sor.visit_status, visit_date=_visit_date.isoformat() if _visit_date else None, loading_task_id=sor.loading_task_id, note=note, ) def test__check_origin_exists_404(requests_mock): url_ko = "https://example.org/some-inexistant-url" requests_mock.head(url_ko, status_code=404) with pytest.raises(BadInputExc, match="not exist"): _check_origin_exists(url_ko) def test__check_origin_exists_200(requests_mock): url = "https://example.org/url" requests_mock.head(url, status_code=200) # passes the check actual_metadata = _check_origin_exists(url) # and we actually may have retrieved some metadata on the origin assert actual_metadata == origin_exists(url) def test_origin_exists_404(requests_mock): """Origin which does not exist should be reported as inexistent""" url_ko = "https://example.org/some-inexistant-url" requests_mock.head(url_ko, status_code=404) actual_result = origin_exists(url_ko) assert actual_result == OriginExistenceCheckInfo( - origin_url=url_ko, exists=False, last_modified=None, content_length=None, + origin_url=url_ko, + exists=False, + last_modified=None, + content_length=None, ) def test_origin_exists_200_no_data(requests_mock): """Existing origin should be reported as such (no extra information)""" url = "http://example.org/real-url" requests_mock.head( - url, status_code=200, + url, + status_code=200, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( - origin_url=url, exists=True, last_modified=None, content_length=None, + origin_url=url, + exists=True, + last_modified=None, + 
content_length=None, ) def test_origin_exists_200_with_data(requests_mock): """Existing origin should be reported as such (+ extra information)""" url = "http://example.org/real-url" requests_mock.head( url, status_code=200, headers={ "content-length": "10", "last-modified": "Sun, 21 Aug 2011 16:26:32 GMT", }, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, content_length=10, last_modified="2011-08-21T16:26:32", ) def test_origin_exists_internet_archive(requests_mock): """Edge case where an artifact URL to check existence is hosted on the Internet Archive""" url = ( "https://web.archive.org/web/20100705043309/" "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz" ) redirect_url = ( "https://web.archive.org/web/20100610004108/" "http://www.cs.unm.edu/~mccune/old-ftp/eqp-09e.tar.gz" ) requests_mock.head( - url, status_code=302, headers={"Location": redirect_url,}, + url, + status_code=302, + headers={ + "Location": redirect_url, + }, ) requests_mock.head( redirect_url, status_code=200, headers={ "X-Archive-Orig-Last-Modified": "Tue, 12 May 2009 22:09:43 GMT", "X-Archive-Orig-Content-Length": "121421", }, ) actual_result = origin_exists(url) assert actual_result == OriginExistenceCheckInfo( origin_url=url, exists=True, content_length=121421, last_modified="2009-05-12T22:09:43", ) def test_origin_exists_200_with_data_unexpected_date_format(requests_mock): """Existing origin should be ok, an unexpected last modified time format results in no date""" url = "http://example.org/real-url2" # this is parsable but not as expected unexpected_format_date = "Sun, 21 Aug 2021 16:26:32" requests_mock.head( - url, status_code=200, headers={"last-modified": unexpected_format_date,}, + url, + status_code=200, + headers={ + "last-modified": unexpected_format_date, + }, ) actual_result = origin_exists(url) # so the resulting date is None assert actual_result == OriginExistenceCheckInfo( - origin_url=url, exists=True, content_length=None, last_modified=None, + origin_url=url, + exists=True, + content_length=None, + last_modified=None, ) @pytest.mark.django_db -@pytest.mark.parametrize("visit_status", [VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING,]) +@pytest.mark.parametrize( + "visit_status", + [ + VISIT_STATUS_CREATED, + VISIT_STATUS_ONGOING, + ], +) def test_get_save_origin_requests_no_visit_date_found( mocker, swh_scheduler, visit_status ): - """Uneventful visits with failed visit status are marked as failed - - """ + """Visits with a non-final status (created, ongoing) are reported as still running""" sors = _get_save_origin_requests( - mocker, swh_scheduler, load_status="scheduled", visit_status=visit_status, + mocker, + swh_scheduler, + load_status="scheduled", + visit_status=visit_status, ) # check a visit date has been found assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_RUNNING assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] == visit_status @pytest.mark.django_db -@pytest.mark.parametrize("visit_status", ["not_found", "failed",]) +@pytest.mark.parametrize( + "visit_status", + [ + "not_found", + "failed", + ], +) def test_get_save_origin_requests_no_failed_status_override( mocker, swh_scheduler, visit_status ): - """Uneventful visits with failed statuses (failed, not found) are marked as failed - - """ + """Uneventful visits with failed statuses (failed, not found) are marked as failed""" sors = _get_save_origin_requests( mocker, swh_scheduler, load_status="uneventful", visit_status=visit_status ) assert len(sors) == 1
assert sors[0]["save_task_status"] == SAVE_TASK_FAILED visit_date = sors[0]["visit_date"] assert visit_date is not None sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_status"] == visit_status @pytest.mark.django_db @pytest.mark.parametrize( "load_status,visit_status", [ ("eventful", VISIT_STATUS_FULL), ("eventful", VISIT_STATUS_PARTIAL), ("uneventful", VISIT_STATUS_PARTIAL), ], ) def test_get_visit_info_for_save_request_succeeded( mocker, swh_scheduler, load_status, visit_status ): """Nominal scenario, below 30 days, returns something""" sors = _get_save_origin_requests( mocker, swh_scheduler, load_status=load_status, visit_status=visit_status ) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] == visit_status sors = get_save_origin_requests(_visit_type, _origin_url) assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_status"] == visit_status @pytest.mark.django_db -@pytest.mark.parametrize("load_status", ["eventful", "uneventful",]) +@pytest.mark.parametrize( + "load_status", + [ + "eventful", + "uneventful", + ], +) def test_get_visit_info_incomplete_visit_still_successful( mocker, swh_scheduler, load_status ): - """Incomplete visit information, yet the task is updated partially - - """ + """Incomplete visit information, yet the task is updated partially""" sors = _get_save_origin_requests( - mocker, swh_scheduler, load_status=load_status, visit_status=None, + mocker, + swh_scheduler, + load_status=load_status, + visit_status=None, ) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED # As the entry is missing the following information though assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None # It's still detected as to be updated by the refresh routine sors = refresh_save_origin_request_statuses() assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None @pytest.mark.django_db def test_refresh_in_progress_save_request_statuses( mocker, swh_scheduler, api_client, archive_data ): - """Refresh a pending save origins requests and update if the status changes - """ + """Refresh a pending save origins requests and update if the status changes""" date_now = datetime.now(tz=timezone.utc) date_pivot = date_now - timedelta(days=30) visit_started_date = date_now - timedelta(minutes=1) # returned visit status SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=VISIT_STATUS_CREATED, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archives _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED, ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=VISIT_STATUS_CREATED, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # make the scheduler return a running 
event _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status="started", visit_started_date=visit_started_date, ) # The visit is detected but still running sors = refresh_save_origin_request_statuses() assert ( mock_archive.origin_visit_find_by_date.called and mock_archive.origin_visit_find_by_date.call_count == 1 ) assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # The status is updated assert sor["save_task_status"] == SAVE_TASK_RUNNING # but the visit status is not final yet, so the entry will be checked again assert sor["visit_date"] is not None assert sor["visit_status"] == VISIT_STATUS_CREATED # make the visit status completed # make the scheduler return a completed event _fill_scheduler_db( swh_scheduler, task_status="completed", task_run_status="eventful", visit_started_date=visit_started_date, ) # This time around, the origin returned will have all required information updated # (visit date and visit status in final state) visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL}) mock_archive.origin_visit_find_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 assert ( mock_archive.origin_visit_find_by_date.called and mock_archive.origin_visit_find_by_date.call_count == 1 + 1 ) for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # the scheduler and the visit status now carry final information, # so the entry gets updated assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_FULL # Once in final state, a sor should not be updated anymore sors = refresh_save_origin_request_statuses() assert len(sors) == 0 @pytest.mark.django_db def test_refresh_save_request_statuses(mocker, swh_scheduler, api_client, archive_data): - """Refresh filters save origins requests and update if changes - - """ + """Refresh filters save origin requests and updates them when anything changed""" date_now = datetime.now(tz=timezone.utc) date_pivot = date_now - timedelta(days=30) # returned visit status SaveOriginRequest.objects.create( request_date=datetime.now(tz=timezone.utc), visit_type=_visit_type, visit_status=None, origin_url=_origin_url, status=SAVE_REQUEST_ACCEPTED, visit_date=None, loading_task_id=_task_id, ) # mock scheduler and archive _fill_scheduler_db( swh_scheduler, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED, ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=VISIT_STATUS_CREATED, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # no changes so refresh does detect the entry but does nothing sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # as it turns out, in this test, this won't update anything as no new status got # returned by the scheduler assert sor["save_task_status"] == SAVE_TASK_RUNNING # visit information is not in a final state yet assert sor["visit_date"] ==
visit_date assert sor["visit_status"] == VISIT_STATUS_CREATED # A save code now entry is detected for update, but as nothing changes, the entry # remains in the same state sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # Status is not updated as no new information is available on the visit status # and the task status has not moved assert sor["save_task_status"] == SAVE_TASK_RUNNING # visit information is not in a final state yet assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_CREATED # This time around, the origin returned will have all information updated # create a visit for the save request with status full visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( date=visit_date, formatted_date="", metadata={}, origin=_origin_url, snapshot="", # make mypy happy status=VISIT_STATUS_FULL, type=_visit_type, url="", visit=34, ) mock_archive.origin_visit_find_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # the visit now reports a final status, so the entry gets updated assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED assert sor["visit_date"] == visit_date assert sor["visit_status"] == VISIT_STATUS_FULL # This time, nothing left to update sors = refresh_save_origin_request_statuses() assert len(sors) == 0 diff --git a/swh/web/tests/common/test_origin_visits.py b/swh/web/tests/common/test_origin_visits.py index f6435b6f..614b0557 100644 --- a/swh/web/tests/common/test_origin_visits.py +++ b/swh/web/tests/common/test_origin_visits.py @@ -1,226 +1,256 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta from hypothesis import given, settings import iso8601 import pytest from swh.model.model import OriginVisit, OriginVisitStatus from swh.storage.utils import now from swh.web.common.exc import NotFoundExc from swh.web.common.origin_visits import get_origin_visit, get_origin_visits from swh.web.common.typing import OriginInfo from swh.web.tests.strategies import new_origin, new_snapshots @settings(max_examples=1) @given(new_origin(), new_snapshots(3)) def test_get_origin_visits(mocker, archive_data, new_origin, new_snapshots): from swh.web.common import archive mocker.patch.object(archive, "MAX_LIMIT", 2) archive_data.origin_add([new_origin]) archive_data.snapshot_add(new_snapshots) for i, snapshot in enumerate(new_snapshots): visit_date = now() + timedelta(days=i * 10) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=visit_date + timedelta(minutes=5), status="full", snapshot=snapshot.id, ) archive_data.origin_visit_status_add([visit_status]) origin_visits = get_origin_visits(new_origin.to_dict()) assert len(origin_visits) == len(new_snapshots) @given(new_origin(), new_snapshots(5)) def test_get_origin_visit(archive_data, new_origin, new_snapshots):
archive_data.origin_add([new_origin]) archive_data.snapshot_add(new_snapshots) visits = [] for i, visit_date in enumerate( map( iso8601.parse_date, [ "2015-07-09T21:09:24+00:00", "2016-02-23T18:05:23.312045+00:00", "2016-03-28T01:35:06.554111+00:00", "2016-06-18T01:22:24.808485+00:00", "2016-08-14T12:10:00.536702+00:00", ], ) ): visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] visits.append(visit) visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=visit_date + timedelta(minutes=5), status="full", snapshot=new_snapshots[i].id, ) archive_data.origin_visit_status_add([visit_status]) origin_info = new_origin.to_dict() visit_id = visits[-1].visit + 1 with pytest.raises(NotFoundExc) as e: visit = get_origin_visit(origin_info, visit_id=visit_id) assert e.match("visit with id %s" % visit_id) assert e.match("Origin %s" % origin_info["url"]) visit_id = visits[1].visit visit = get_origin_visit(origin_info, visit_id=visit_id) assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) visit = get_origin_visit(origin_info, visit_ts="2016-02-23T18:05:23.312045+00:00") assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) visit = get_origin_visit(origin_info, visit_ts="2016-02-20") assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) visit_id = visits[3].visit visit = get_origin_visit(origin_info, visit_ts="2016-06-18T01:22") assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) visit = get_origin_visit(origin_info, visit_ts="2016-06-18 01:22") assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) visit_id = visits[0].visit visit = get_origin_visit(origin_info, visit_ts="2014-01-01") assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) visit_id = visits[-1].visit visit = get_origin_visit(origin_info, visit_ts="2018-01-01") assert visit == archive_data.origin_visit_get_by(new_origin.url, visit_id=visit_id) @given(new_origin(), new_snapshots(6)) def test_get_origin_visit_return_first_valid_full_visit( archive_data, new_origin, new_snapshots ): visits = [] archive_data.origin_add([new_origin]) # create 6 visits, the first three have full status while the # last three have partial status and set a null snapshot for # the last four visits for i, snp in enumerate(new_snapshots): visit_date = now() + timedelta(days=i * 10) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=visit_date + timedelta(minutes=5), status="full" if i < 3 else "partial", snapshot=new_snapshots[i].id if i < 2 else None, ) if i < 2: archive_data.origin_visit_status_add([visit_status]) visits.append(visit.visit) # should return the second visit expected_visit = archive_data.origin_visit_get_by(new_origin.url, visits[1]) assert get_origin_visit((OriginInfo(url=new_origin.url))) == expected_visit @given(new_origin(), new_snapshots(6)) def test_get_origin_visit_non_resolvable_snapshots( archive_data, new_origin, new_snapshots ): visits = [] archive_data.origin_add([new_origin]) # create 6 full visits, the first three have 
resolvable snapshots # while the last three have non resolvable snapshots for i, snp in enumerate(new_snapshots): visit_date = now() + timedelta(days=i * 10) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=visit_date + timedelta(minutes=5), status="full", snapshot=new_snapshots[i].id, ) if i < 3: archive_data.origin_visit_status_add([visit_status]) visits.append(visit.visit) # should return the third visit expected_visit = archive_data.origin_visit_get_by(new_origin.url, visits[2]) assert get_origin_visit((OriginInfo(url=new_origin.url))) == expected_visit @given(new_origin(), new_snapshots(6)) def test_get_origin_visit_return_first_valid_partial_visit( archive_data, new_origin, new_snapshots ): visits = [] archive_data.origin_add([new_origin]) # create 6 visits, the first three have full status but null snapshot # while the last three have partial status with valid snapshot for i, snp in enumerate(new_snapshots): visit_date = now() + timedelta(days=i * 10) visit = archive_data.origin_visit_add( - [OriginVisit(origin=new_origin.url, date=visit_date, type="git",)] + [ + OriginVisit( + origin=new_origin.url, + date=visit_date, + type="git", + ) + ] )[0] archive_data.snapshot_add([new_snapshots[i]]) visit_status = OriginVisitStatus( origin=new_origin.url, visit=visit.visit, date=visit_date + timedelta(minutes=5), status="full" if i < 3 else "partial", snapshot=new_snapshots[i].id if i > 2 else None, ) if i > 2: archive_data.origin_visit_status_add([visit_status]) visits.append(visit.visit) # should return the last visit expected_visit = archive_data.origin_visit_get_by(new_origin.url, visits[-1]) assert get_origin_visit((OriginInfo(url=new_origin.url))) == expected_visit def test_get_origin_visit_latest_snapshot(mocker, origin_with_multiple_visits): origin_visits = get_origin_visits(origin_with_multiple_visits) first_visit = origin_visits[0] latest_visit = origin_visits[-1] mock_get_origin_visits = mocker.patch( "swh.web.common.origin_visits.get_origin_visits" ) mock_get_origin_visits.return_value = origin_visits visit = get_origin_visit( origin_with_multiple_visits, snapshot_id=latest_visit["snapshot"] ) assert visit == latest_visit assert not mock_get_origin_visits.called visit = get_origin_visit( origin_with_multiple_visits, snapshot_id=first_visit["snapshot"] ) assert visit == first_visit assert mock_get_origin_visits.called diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py index a1c07db2..8dc5e7fc 100644 --- a/swh/web/tests/conftest.py +++ b/swh/web/tests/conftest.py @@ -1,1249 +1,1235 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from datetime import timedelta import functools import json import os import random import shutil from subprocess import PIPE, run import sys import time from typing import Any, Dict, List, Optional from _pytest.python import Function from hypothesis import HealthCheck, settings import pytest from django.contrib.auth.models import User from django.core.cache import cache from django.test.utils import setup_databases # type: 
ignore from rest_framework.test import APIClient, APIRequestFactory from swh.model.hashutil import ( ALGORITHMS, DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex, ) from swh.model.model import Content, Directory from swh.model.swhids import ObjectType from swh.scheduler.tests.common import TASK_TYPES from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.revisions_walker import get_revisions_walker from swh.storage.algos.snapshot import snapshot_get_all_branches, snapshot_get_latest from swh.web.auth.utils import ( ADD_FORGE_MODERATOR_PERMISSION, MAILMAP_ADMIN_PERMISSION, MAILMAP_PERMISSION, OIDC_SWH_WEB_CLIENT_ID, ) from swh.web.common import converters from swh.web.common.origin_save import get_scheduler_load_task_types from swh.web.common.typing import OriginVisitInfo from swh.web.common.utils import browsers_supported_image_mimes from swh.web.config import get_config from swh.web.tests.data import ( get_tests_data, override_storages, random_content, random_sha1, random_sha256, ) from swh.web.tests.utils import create_django_permission os.environ["LC_ALL"] = "C.UTF-8" # Used to skip some tests ctags_json_missing = ( shutil.which("ctags") is None or b"+json" not in run(["ctags", "--version"], stdout=PIPE).stdout ) fossology_missing = shutil.which("nomossa") is None # Register some hypothesis profiles settings.register_profile("default", settings()) # we use getattr here to keep mypy happy regardless hypothesis version function_scoped_fixture_check = ( [getattr(HealthCheck, "function_scoped_fixture")] if hasattr(HealthCheck, "function_scoped_fixture") else [] ) suppress_health_check = [ HealthCheck.too_slow, HealthCheck.filter_too_much, ] + function_scoped_fixture_check settings.register_profile( - "swh-web", settings(deadline=None, suppress_health_check=suppress_health_check,), + "swh-web", + settings( + deadline=None, + suppress_health_check=suppress_health_check, + ), ) settings.register_profile( "swh-web-fast", settings( - deadline=None, max_examples=5, suppress_health_check=suppress_health_check, + deadline=None, + max_examples=5, + suppress_health_check=suppress_health_check, ), ) def pytest_addoption(parser): parser.addoption("--swh-web-random-seed", action="store", default=None) def pytest_configure(config): # Use fast hypothesis profile by default if none has been # explicitly specified in pytest option if config.getoption("--hypothesis-profile") is None: settings.load_profile("swh-web-fast") # Small hack in order to be able to run the unit tests # without static assets generated by webpack. # Those assets are not really needed for the Python tests # but the django templates will fail to load due to missing # generated file webpack-stats.json describing the js and css # files to include. # So generate a dummy webpack-stats.json file to overcome # that issue. 
test_dir = os.path.dirname(__file__) # location of the static folder when running tests through tox data_dir = os.path.join(sys.prefix, "share/swh/web") static_dir = os.path.join(data_dir, "static") if not os.path.exists(static_dir): # location of the static folder when running tests locally with pytest static_dir = os.path.join(test_dir, "../../../static") webpack_stats = os.path.join(static_dir, "webpack-stats.json") if os.path.exists(webpack_stats): return bundles_dir = os.path.join(test_dir, "../../../assets/src/bundles") if not os.path.exists(bundles_dir): # location of the bundles folder when running tests with tox bundles_dir = os.path.join(data_dir, "assets/src/bundles") _, bundles, _ = next(os.walk(bundles_dir)) mock_webpack_stats = { "status": "done", "publicPath": "/static", "chunks": {}, "assets": {}, } for bundle in bundles: asset = f"js/{bundle}.js" mock_webpack_stats["chunks"][bundle] = [asset] mock_webpack_stats["assets"][asset] = { "name": asset, "publicPath": f"/static/{asset}", } with open(webpack_stats, "w") as outfile: json.dump(mock_webpack_stats, outfile) _swh_web_custom_section = "swh-web custom section" _random_seed_cache_key = "swh-web/random-seed" @pytest.fixture(scope="function", autouse=True) def random_seed(pytestconfig): state = random.getstate() seed = pytestconfig.getoption("--swh-web-random-seed") if seed is None: seed = time.time() seed = int(seed) cache.set(_random_seed_cache_key, seed) random.seed(seed) yield seed random.setstate(state) def pytest_report_teststatus(report, *args): if report.when == "call" and report.outcome == "failed": seed = cache.get(_random_seed_cache_key, None) line = ( f'FAILED {report.nodeid}: Use "pytest --swh-web-random-seed={seed} ' f'{report.nodeid}" to reproduce that test failure with same inputs' ) report.sections.append((_swh_web_custom_section, line)) def pytest_terminal_summary(terminalreporter, *args): reports = terminalreporter.getreports("failed") content = os.linesep.join( text for report in reports for secname, text in report.sections if secname == _swh_web_custom_section ) if content: terminalreporter.ensure_newline() terminalreporter.section(_swh_web_custom_section, sep="-", blue=True, bold=True) terminalreporter.line(content) # Clear Django cache before each test @pytest.fixture(autouse=True) def django_cache_cleared(): cache.clear() # Alias rf fixture from pytest-django @pytest.fixture def request_factory(rf): return rf # Fixture to get test client from Django REST Framework @pytest.fixture def api_client(): return APIClient() # Fixture to get API request factory from Django REST Framework @pytest.fixture def api_request_factory(): return APIRequestFactory() # Initialize tests data @pytest.fixture(scope="function", autouse=True) def tests_data(): data = get_tests_data(reset=True) # Update swh-web configuration to use the in-memory storages # instantiated in the tests.data module override_storages( data["storage"], data["idx_storage"], data["search"], data["counters"] ) return data @pytest.fixture(scope="function") def sha1(): - """Fixture returning a valid hexadecimal sha1 value. - """ + """Fixture returning a valid hexadecimal sha1 value.""" return random_sha1() @pytest.fixture(scope="function") def invalid_sha1(): - """Fixture returning an invalid sha1 representation. - """ + """Fixture returning an invalid sha1 representation.""" return hash_to_hex(bytes(random.randint(0, 255) for _ in range(50))) @pytest.fixture(scope="function") def sha256(): - """Fixture returning a valid hexadecimal sha256 value. 
- """ + """Fixture returning a valid hexadecimal sha256 value.""" return random_sha256() def _known_swh_objects(tests_data, object_type): return tests_data[object_type] @pytest.fixture(scope="function") def content(tests_data): - """Fixture returning a random content ingested into the test archive. - """ + """Fixture returning a random content ingested into the test archive.""" return random.choice(_known_swh_objects(tests_data, "contents")) @pytest.fixture(scope="function") def contents(tests_data): - """Fixture returning random contents ingested into the test archive. - """ + """Fixture returning random contents ingested into the test archive.""" return random.choices( _known_swh_objects(tests_data, "contents"), k=random.randint(2, 8) ) def _new_content(tests_data): while True: new_content = random_content() sha1_bytes = hash_to_bytes(new_content["sha1"]) if tests_data["storage"].content_get_data(sha1_bytes) is None: return new_content @pytest.fixture(scope="function") def unknown_content(tests_data): - """Fixture returning a random content not ingested into the test archive. - """ + """Fixture returning a random content not ingested into the test archive.""" return _new_content(tests_data) @pytest.fixture(scope="function") def unknown_contents(tests_data): - """Fixture returning random contents not ingested into the test archive. - """ + """Fixture returning random contents not ingested into the test archive.""" new_contents = [] new_content_ids = set() nb_contents = random.randint(2, 8) while len(new_contents) != nb_contents: new_content = _new_content(tests_data) if new_content["sha1"] not in new_content_ids: new_contents.append(new_content) new_content_ids.add(new_content["sha1"]) return list(new_contents) @pytest.fixture(scope="function") def empty_content(): - """Fixture returning the empty content ingested into the test archive. - """ + """Fixture returning the empty content ingested into the test archive.""" empty_content = Content.from_data(data=b"").to_dict() for algo in DEFAULT_ALGORITHMS: empty_content[algo] = hash_to_hex(empty_content[algo]) return empty_content @functools.lru_cache(maxsize=None) def _content_text(): return list( filter( lambda c: c["mimetype"].startswith("text/"), _known_swh_objects(get_tests_data(), "contents"), ) ) @pytest.fixture(scope="function") def content_text(): """ Fixture returning a random textual content ingested into the test archive. """ return random.choice(_content_text()) @functools.lru_cache(maxsize=None) def _content_text_non_utf8(): return list( filter( lambda c: c["mimetype"].startswith("text/") and c["encoding"] not in ("utf-8", "us-ascii"), _known_swh_objects(get_tests_data(), "contents"), ) ) @pytest.fixture(scope="function") def content_text_non_utf8(): """Fixture returning a random textual content not encoded to UTF-8 ingested into the test archive. """ return random.choice(_content_text_non_utf8()) @functools.lru_cache(maxsize=None) def _content_application_no_highlight(): return list( filter( lambda c: c["mimetype"].startswith("application/") and c["hljs_language"] == "plaintext", _known_swh_objects(get_tests_data(), "contents"), ) ) @pytest.fixture(scope="function") def content_application_no_highlight(): """Fixture returning a random textual content with mimetype starting with application/ and no detected programming language to highlight ingested into the test archive. 
""" return random.choice(_content_application_no_highlight()) @functools.lru_cache(maxsize=None) def _content_text_no_highlight(): return list( filter( lambda c: c["mimetype"].startswith("text/") and c["hljs_language"] == "plaintext", _known_swh_objects(get_tests_data(), "contents"), ) ) @pytest.fixture(scope="function") def content_text_no_highlight(): """Fixture returning a random textual content with no detected programming language to highlight ingested into the test archive. """ return random.choice(_content_text_no_highlight()) @functools.lru_cache(maxsize=None) def _content_image_type(): return list( filter( lambda c: c["mimetype"] in browsers_supported_image_mimes, _known_swh_objects(get_tests_data(), "contents"), ) ) @pytest.fixture(scope="function") def content_image_type(): - """Fixture returning a random image content ingested into the test archive. - """ + """Fixture returning a random image content ingested into the test archive.""" return random.choice(_content_image_type()) @functools.lru_cache(maxsize=None) def _content_unsupported_image_type_rendering(): return list( filter( lambda c: c["mimetype"].startswith("image/") and c["mimetype"] not in browsers_supported_image_mimes, _known_swh_objects(get_tests_data(), "contents"), ) ) @pytest.fixture(scope="function") def content_unsupported_image_type_rendering(): """Fixture returning a random image content ingested into the test archive that can not be rendered by browsers. """ return random.choice(_content_unsupported_image_type_rendering()) @functools.lru_cache(maxsize=None) def _content_utf8_detected_as_binary(): def utf8_binary_detected(content): if content["encoding"] != "binary": return False try: content["raw_data"].decode("utf-8") except Exception: return False else: return True return list( filter(utf8_binary_detected, _known_swh_objects(get_tests_data(), "contents")) ) @pytest.fixture(scope="function") def content_utf8_detected_as_binary(): """Fixture returning a random textual content detected as binary by libmagic while they are valid UTF-8 encoded files. """ return random.choice(_content_utf8_detected_as_binary()) @pytest.fixture(scope="function") def contents_with_ctags(): """ Fixture returning contents ingested into the test archive. Those contents are ctags compatible, that is running ctags on those lay results. """ return { "sha1s": [ "0ab37c02043ebff946c1937523f60aadd0844351", "15554cf7608dde6bfefac7e3d525596343a85b6f", "2ce837f1489bdfb8faf3ebcc7e72421b5bea83bd", "30acd0b47fc25e159e27a980102ddb1c4bea0b95", "4f81f05aaea3efb981f9d90144f746d6b682285b", "5153aa4b6e4455a62525bc4de38ed0ff6e7dd682", "59d08bafa6a749110dfb65ba43a61963d5a5bf9f", "7568285b2d7f31ae483ae71617bd3db873deaa2c", "7ed3ee8e94ac52ba983dd7690bdc9ab7618247b4", "8ed7ef2e7ff9ed845e10259d08e4145f1b3b5b03", "9b3557f1ab4111c8607a4f2ea3c1e53c6992916c", "9c20da07ed14dc4fcd3ca2b055af99b2598d8bdd", "c20ceebd6ec6f7a19b5c3aebc512a12fbdc9234b", "e89e55a12def4cd54d5bff58378a3b5119878eb7", "e8c0654fe2d75ecd7e0b01bee8a8fc60a130097e", "eb6595e559a1d34a2b41e8d4835e0e4f98a5d2b5", ], "symbol_name": "ABS", } @pytest.fixture(scope="function") def directory(tests_data): - """Fixture returning a random directory ingested into the test archive. 
- """ + """Fixture returning a random directory ingested into the test archive.""" return random.choice(_known_swh_objects(tests_data, "directories")) @functools.lru_cache(maxsize=None) def _directory_with_entry_type(type_): tests_data = get_tests_data() return list( filter( lambda d: any( [ e["type"] == type_ for e in list(tests_data["storage"].directory_ls(hash_to_bytes(d))) ] ), _known_swh_objects(tests_data, "directories"), ) ) @pytest.fixture(scope="function") def directory_with_subdirs(): """Fixture returning a random directory containing sub directories ingested into the test archive. """ return random.choice(_directory_with_entry_type("dir")) @pytest.fixture(scope="function") def directory_with_files(): - """Fixture returning a random directory containing at least one regular file. - """ + """Fixture returning a random directory containing at least one regular file.""" return random.choice(_directory_with_entry_type("file")) @pytest.fixture(scope="function") def unknown_directory(tests_data): - """Fixture returning a random directory not ingested into the test archive. - """ + """Fixture returning a random directory not ingested into the test archive.""" while True: new_directory = random_sha1() sha1_bytes = hash_to_bytes(new_directory) if list(tests_data["storage"].directory_missing([sha1_bytes])): return new_directory @pytest.fixture(scope="function") def empty_directory(): - """Fixture returning the empty directory ingested into the test archive. - """ + """Fixture returning the empty directory ingested into the test archive.""" return Directory(entries=()).id.hex() @pytest.fixture(scope="function") def revision(tests_data): - """Fixturereturning a random revision ingested into the test archive. - """ + """Fixturereturning a random revision ingested into the test archive.""" return random.choice(_known_swh_objects(tests_data, "revisions")) @pytest.fixture(scope="function") def revisions(tests_data): - """Fixture returning random revisions ingested into the test archive. - """ + """Fixture returning random revisions ingested into the test archive.""" return random.choices( - _known_swh_objects(tests_data, "revisions"), k=random.randint(2, 8), + _known_swh_objects(tests_data, "revisions"), + k=random.randint(2, 8), ) @pytest.fixture(scope="function") def revisions_list(tests_data): - """Fixture returning random revisions ingested into the test archive. - """ + """Fixture returning random revisions ingested into the test archive.""" def gen_revisions_list(size): - return random.choices(_known_swh_objects(tests_data, "revisions"), k=size,) + return random.choices( + _known_swh_objects(tests_data, "revisions"), + k=size, + ) return gen_revisions_list @pytest.fixture(scope="function") def unknown_revision(tests_data): - """Fixture returning a random revision not ingested into the test archive. 
- """ + """Fixture returning a random revision not ingested into the test archive.""" while True: new_revision = random_sha1() sha1_bytes = hash_to_bytes(new_revision) if tests_data["storage"].revision_get([sha1_bytes])[0] is None: return new_revision def _get_origin_dfs_revisions_walker(tests_data): storage = tests_data["storage"] origin = random.choice(tests_data["origins"][:-1]) snapshot = snapshot_get_latest(storage, origin["url"]) if snapshot.branches[b"HEAD"].target_type.value == "alias": target = snapshot.branches[b"HEAD"].target head = snapshot.branches[target].target else: head = snapshot.branches[b"HEAD"].target return get_revisions_walker("dfs", storage, head) @functools.lru_cache(maxsize=None) def _ancestor_revisions_data(): # get a dfs revisions walker for one of the origins # loaded into the test archive revisions_walker = _get_origin_dfs_revisions_walker(get_tests_data()) master_revisions = [] children = defaultdict(list) init_rev_found = False # get revisions only authored in the master branch for rev in revisions_walker: for rev_p in rev["parents"]: children[rev_p].append(rev["id"]) if not init_rev_found: master_revisions.append(rev) if not rev["parents"]: init_rev_found = True return master_revisions, children @pytest.fixture(scope="function") def ancestor_revisions(): """Fixture returning a pair of revisions ingested into the test archive with an ancestor relation. """ master_revisions, children = _ancestor_revisions_data() # head revision root_rev = master_revisions[0] # pick a random revision, different from head, only authored # in the master branch ancestor_rev_idx = random.choice(list(range(1, len(master_revisions) - 1))) ancestor_rev = master_revisions[ancestor_rev_idx] ancestor_child_revs = children[ancestor_rev["id"]] return { "sha1_git_root": hash_to_hex(root_rev["id"]), "sha1_git": hash_to_hex(ancestor_rev["id"]), "children": [hash_to_hex(r) for r in ancestor_child_revs], } @functools.lru_cache(maxsize=None) def _non_ancestor_revisions_data(): # get a dfs revisions walker for one of the origins # loaded into the test archive revisions_walker = _get_origin_dfs_revisions_walker(get_tests_data()) merge_revs = [] children = defaultdict(list) # get all merge revisions for rev in revisions_walker: if len(rev["parents"]) > 1: merge_revs.append(rev) for rev_p in rev["parents"]: children[rev_p].append(rev["id"]) return merge_revs, children @pytest.fixture(scope="function") def non_ancestor_revisions(): """Fixture returning a pair of revisions ingested into the test archive with no ancestor relation. """ merge_revs, children = _non_ancestor_revisions_data() # find a merge revisions whose parents have a unique child revision random.shuffle(merge_revs) selected_revs = None for merge_rev in merge_revs: if all(len(children[rev_p]) == 1 for rev_p in merge_rev["parents"]): selected_revs = merge_rev["parents"] return { "sha1_git_root": hash_to_hex(selected_revs[0]), "sha1_git": hash_to_hex(selected_revs[1]), } @pytest.fixture(scope="function") def revision_with_submodules(): """Fixture returning a revision that is known to point to a directory with revision entries (aka git submodules) """ return { "rev_sha1_git": "ffcb69001f3f6745dfd5b48f72ab6addb560e234", "rev_dir_sha1_git": "d92a21446387fa28410e5a74379c934298f39ae2", "rev_dir_rev_path": "libtess2", } @pytest.fixture(scope="function") def release(tests_data): - """Fixture returning a random release ingested into the test archive. 
- """ + """Fixture returning a random release ingested into the test archive.""" return random.choice(_known_swh_objects(tests_data, "releases")) @pytest.fixture(scope="function") def releases(tests_data): - """Fixture returning random releases ingested into the test archive. - """ + """Fixture returning random releases ingested into the test archive.""" return random.choices( _known_swh_objects(tests_data, "releases"), k=random.randint(2, 8) ) @pytest.fixture(scope="function") def unknown_release(tests_data): - """Fixture returning a random release not ingested into the test archive. - """ + """Fixture returning a random release not ingested into the test archive.""" while True: new_release = random_sha1() sha1_bytes = hash_to_bytes(new_release) if tests_data["storage"].release_get([sha1_bytes])[0] is None: return new_release @pytest.fixture(scope="function") def snapshot(tests_data): - """Fixture returning a random snapshot ingested into the test archive. - """ + """Fixture returning a random snapshot ingested into the test archive.""" return random.choice(_known_swh_objects(tests_data, "snapshots")) @pytest.fixture(scope="function") def unknown_snapshot(tests_data): - """Fixture returning a random snapshot not ingested into the test archive. - """ + """Fixture returning a random snapshot not ingested into the test archive.""" while True: new_snapshot = random_sha1() sha1_bytes = hash_to_bytes(new_snapshot) if tests_data["storage"].snapshot_get_branches(sha1_bytes) is None: return new_snapshot @pytest.fixture(scope="function") def origin(tests_data): - """Fixture returning a random origin ingested into the test archive. - """ + """Fixture returning a random origin ingested into the test archive.""" return random.choice(_known_swh_objects(tests_data, "origins")) @functools.lru_cache(maxsize=None) def _origin_with_multiple_visits(): tests_data = get_tests_data() origins = [] storage = tests_data["storage"] for origin in tests_data["origins"]: visit_page = storage.origin_visit_get(origin["url"]) if len(visit_page.results) > 1: origins.append(origin) return origins @pytest.fixture(scope="function") def origin_with_multiple_visits(): """Fixture returning a random origin with multiple visits ingested into the test archive. """ return random.choice(_origin_with_multiple_visits()) @functools.lru_cache(maxsize=None) def _origin_with_releases(): tests_data = get_tests_data() origins = [] for origin in tests_data["origins"]: snapshot = snapshot_get_latest(tests_data["storage"], origin["url"]) if any([b.target_type.value == "release" for b in snapshot.branches.values()]): origins.append(origin) return origins @pytest.fixture(scope="function") def origin_with_releases(): - """Fixture returning a random origin with releases ingested into the test archive. - """ + """Fixture returning a random origin with releases ingested into the test archive.""" return random.choice(_origin_with_releases()) @functools.lru_cache(maxsize=None) def _origin_with_pull_request_branches(): tests_data = get_tests_data() origins = [] storage = tests_data["storage"] for origin in storage.origin_list(limit=1000).results: snapshot = snapshot_get_latest(storage, origin.url) if any([b"refs/pull/" in b for b in snapshot.branches]): origins.append(origin) return origins @pytest.fixture(scope="function") def origin_with_pull_request_branches(): """Fixture returning a random origin with pull request branches ingested into the test archive. 
""" return random.choice(_origin_with_pull_request_branches()) @functools.lru_cache(maxsize=None) def _object_type_swhid(object_type): return list( filter( lambda swhid: swhid.object_type == object_type, _known_swh_objects(get_tests_data(), "swhids"), ) ) @pytest.fixture(scope="function") def content_swhid(): """Fixture returning a qualified SWHID for a random content object ingested into the test archive. """ return random.choice(_object_type_swhid(ObjectType.CONTENT)) @pytest.fixture(scope="function") def directory_swhid(): """Fixture returning a qualified SWHID for a random directory object ingested into the test archive. """ return random.choice(_object_type_swhid(ObjectType.DIRECTORY)) @pytest.fixture(scope="function") def release_swhid(): """Fixture returning a qualified SWHID for a random release object ingested into the test archive. """ return random.choice(_object_type_swhid(ObjectType.RELEASE)) @pytest.fixture(scope="function") def revision_swhid(): """Fixture returning a qualified SWHID for a random revision object ingested into the test archive. """ return random.choice(_object_type_swhid(ObjectType.REVISION)) @pytest.fixture(scope="function") def snapshot_swhid(): """Fixture returning a qualified SWHID for a snapshot object ingested into the test archive. """ return random.choice(_object_type_swhid(ObjectType.SNAPSHOT)) # Fixture to manipulate data from a sample archive used in the tests @pytest.fixture(scope="function") def archive_data(tests_data): return _ArchiveData(tests_data) # Fixture to manipulate indexer data from a sample archive used in the tests @pytest.fixture(scope="function") def indexer_data(tests_data): return _IndexerData(tests_data) # Custom data directory for requests_mock @pytest.fixture def datadir(): return os.path.join(os.path.abspath(os.path.dirname(__file__)), "resources") class _ArchiveData: """ Helper class to manage data from a sample test archive. It is initialized with a reference to an in-memory storage containing raw tests data. It is basically a proxy to Storage interface but it overrides some methods to retrieve those tests data in a json serializable format in order to ease tests implementation. 
""" def __init__(self, tests_data): self.storage = tests_data["storage"] def __getattr__(self, key): if key == "storage": raise AttributeError(key) # Forward calls to non overridden Storage methods to wrapped # storage instance return getattr(self.storage, key) def content_find(self, content: Dict[str, Any]) -> Dict[str, Any]: cnt_ids_bytes = { algo_hash: hash_to_bytes(content[algo_hash]) for algo_hash in ALGORITHMS if content.get(algo_hash) } cnt = self.storage.content_find(cnt_ids_bytes) return converters.from_content(cnt[0].to_dict()) if cnt else cnt def content_get(self, cnt_id: str) -> Dict[str, Any]: cnt_id_bytes = hash_to_bytes(cnt_id) content = self.storage.content_get([cnt_id_bytes])[0] if content: content_d = content.to_dict() content_d.pop("ctime", None) else: content_d = None return converters.from_swh( content_d, hashess={"sha1", "sha1_git", "sha256", "blake2s256"} ) def content_get_data(self, cnt_id: str) -> Optional[Dict[str, Any]]: cnt_id_bytes = hash_to_bytes(cnt_id) cnt_data = self.storage.content_get_data(cnt_id_bytes) if cnt_data is None: return None return converters.from_content({"data": cnt_data, "sha1": cnt_id_bytes}) def directory_get(self, dir_id): return {"id": dir_id, "content": self.directory_ls(dir_id)} def directory_ls(self, dir_id): cnt_id_bytes = hash_to_bytes(dir_id) dir_content = map( converters.from_directory_entry, self.storage.directory_ls(cnt_id_bytes) ) return list(dir_content) def release_get(self, rel_id: str) -> Optional[Dict[str, Any]]: rel_id_bytes = hash_to_bytes(rel_id) rel_data = self.storage.release_get([rel_id_bytes])[0] return converters.from_release(rel_data) if rel_data else None def revision_get(self, rev_id: str) -> Optional[Dict[str, Any]]: rev_id_bytes = hash_to_bytes(rev_id) rev_data = self.storage.revision_get([rev_id_bytes])[0] return converters.from_revision(rev_data) if rev_data else None def revision_log(self, rev_id, limit=None): rev_id_bytes = hash_to_bytes(rev_id) return list( map( converters.from_revision, self.storage.revision_log([rev_id_bytes], limit=limit), ) ) def snapshot_get_latest(self, origin_url): snp = snapshot_get_latest(self.storage, origin_url) return converters.from_snapshot(snp.to_dict()) def origin_get(self, origin_urls): origins = self.storage.origin_get(origin_urls) return [converters.from_origin(o.to_dict()) for o in origins] def origin_visit_get(self, origin_url): next_page_token = None visits = [] while True: visit_page = self.storage.origin_visit_get( origin_url, page_token=next_page_token ) next_page_token = visit_page.next_page_token for visit in visit_page.results: visit_status = self.storage.origin_visit_status_get_latest( origin_url, visit.visit ) visits.append( converters.from_origin_visit( {**visit_status.to_dict(), "type": visit.type} ) ) if not next_page_token: break return visits def origin_visit_get_by(self, origin_url: str, visit_id: int) -> OriginVisitInfo: visit = self.storage.origin_visit_get_by(origin_url, visit_id) assert visit is not None visit_status = self.storage.origin_visit_status_get_latest(origin_url, visit_id) assert visit_status is not None return converters.from_origin_visit( {**visit_status.to_dict(), "type": visit.type} ) def origin_visit_status_get_latest( self, origin_url, type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, ): visit_status = origin_get_latest_visit_status( self.storage, origin_url, type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, ) return ( 
converters.from_origin_visit(visit_status.to_dict()) if visit_status else None ) def snapshot_get(self, snapshot_id): snp = snapshot_get_all_branches(self.storage, hash_to_bytes(snapshot_id)) return converters.from_snapshot(snp.to_dict()) def snapshot_get_branches( self, snapshot_id, branches_from="", branches_count=1000, target_types=None ): partial_branches = self.storage.snapshot_get_branches( hash_to_bytes(snapshot_id), branches_from.encode(), branches_count, target_types, ) return converters.from_partial_branches(partial_branches) def snapshot_get_head(self, snapshot): if snapshot["branches"]["HEAD"]["target_type"] == "alias": target = snapshot["branches"]["HEAD"]["target"] head = snapshot["branches"][target]["target"] else: head = snapshot["branches"]["HEAD"]["target"] return head def snapshot_count_branches(self, snapshot_id): counts = dict.fromkeys(("alias", "release", "revision"), 0) counts.update(self.storage.snapshot_count_branches(hash_to_bytes(snapshot_id))) counts.pop(None, None) return counts class _IndexerData: """ Helper class to manage indexer test data. It is initialized with a reference to an in-memory indexer storage containing raw test data. It also defines class methods to retrieve that test data in a JSON-serializable format in order to ease test implementation. """ def __init__(self, tests_data): self.idx_storage = tests_data["idx_storage"] self.mimetype_indexer = tests_data["mimetype_indexer"] self.license_indexer = tests_data["license_indexer"] self.ctags_indexer = tests_data["ctags_indexer"] def content_add_mimetype(self, cnt_id): self.mimetype_indexer.run([hash_to_bytes(cnt_id)]) def content_get_mimetype(self, cnt_id): mimetype = self.idx_storage.content_mimetype_get([hash_to_bytes(cnt_id)])[ 0 ].to_dict() return converters.from_filetype(mimetype) def content_add_license(self, cnt_id): self.license_indexer.run([hash_to_bytes(cnt_id)]) def content_get_license(self, cnt_id): cnt_id_bytes = hash_to_bytes(cnt_id) licenses = self.idx_storage.content_fossology_license_get([cnt_id_bytes]) for license in licenses: yield converters.from_swh(license.to_dict(), hashess={"id"}) def content_add_ctags(self, cnt_id): self.ctags_indexer.run([hash_to_bytes(cnt_id)]) def content_get_ctags(self, cnt_id): cnt_id_bytes = hash_to_bytes(cnt_id) ctags = self.idx_storage.content_ctags_get([cnt_id_bytes]) for ctag in ctags: yield converters.from_swh(ctag, hashess={"id"}) @pytest.fixture def keycloak_oidc(keycloak_oidc, mocker): keycloak_config = get_config()["keycloak"] keycloak_oidc.server_url = keycloak_config["server_url"] keycloak_oidc.realm_name = keycloak_config["realm_name"] keycloak_oidc.client_id = OIDC_SWH_WEB_CLIENT_ID keycloak_oidc_client = mocker.patch("swh.web.auth.views.keycloak_oidc_client") keycloak_oidc_client.return_value = keycloak_oidc return keycloak_oidc @pytest.fixture def subtest(request): """A hack to explicitly set up and tear down fixtures. This fixture allows you to set up and tear down fixtures within the test function itself. This is useful (necessary!) for using Hypothesis inside pytest, as hypothesis will call the test function multiple times, without setting up or tearing down fixture state as is normally the case. Copied from the pytest-subtesthack project, public domain license (https://github.com/untitaker/pytest-subtesthack).
""" parent_test = request.node def inner(func): if hasattr(Function, "from_parent"): item = Function.from_parent( parent_test, name=request.function.__name__ + "[]", originalname=request.function.__name__, callobj=func, ) else: item = Function( name=request.function.__name__ + "[]", parent=parent_test, callobj=func ) nextitem = parent_test # prevents pytest from tearing down module fixtures item.ihook.pytest_runtest_setup(item=item) item.ihook.pytest_runtest_call(item=item) item.ihook.pytest_runtest_teardown(item=item, nextitem=nextitem) return inner @pytest.fixture def swh_scheduler(swh_scheduler): config = get_config() scheduler = config["scheduler"] config["scheduler"] = swh_scheduler # create load-git and load-hg task types for task_type in TASK_TYPES.values(): # see https://forge.softwareheritage.org/rDSCHc46ffadf7adf24c7eb3ffce062e8ade3818c79cc # noqa task_type["type"] = task_type["type"].replace("load-test-", "load-", 1) swh_scheduler.create_task_type(task_type) # create load-svn task type swh_scheduler.create_task_type( { "type": "load-svn", "description": "Update a Subversion repository", "backend_name": "swh.loader.svn.tasks.DumpMountAndLoadSvnRepository", "default_interval": timedelta(days=64), "min_interval": timedelta(hours=12), "max_interval": timedelta(days=64), "backoff_factor": 2, "max_queue_length": None, "num_retries": 7, "retry_delay": timedelta(hours=2), } ) # create load-cvs task type swh_scheduler.create_task_type( { "type": "load-cvs", "description": "Update a CVS repository", "backend_name": "swh.loader.cvs.tasks.DumpMountAndLoadSvnRepository", "default_interval": timedelta(days=64), "min_interval": timedelta(hours=12), "max_interval": timedelta(days=64), "backoff_factor": 2, "max_queue_length": None, "num_retries": 7, "retry_delay": timedelta(hours=2), } ) # create load-bzr task type swh_scheduler.create_task_type( { "type": "load-bzr", "description": "Update a Bazaar repository", "backend_name": "swh.loader.bzr.tasks.LoadBazaar", "default_interval": timedelta(days=64), "min_interval": timedelta(hours=12), "max_interval": timedelta(days=64), "backoff_factor": 2, "max_queue_length": None, "num_retries": 7, "retry_delay": timedelta(hours=2), } ) # add method to add load-archive-files task type during tests def add_load_archive_task_type(): swh_scheduler.create_task_type( { "type": "load-archive-files", "description": "Load tarballs", "backend_name": "swh.loader.package.archive.tasks.LoadArchive", "default_interval": timedelta(days=64), "min_interval": timedelta(hours=12), "max_interval": timedelta(days=64), "backoff_factor": 2, "max_queue_length": None, "num_retries": 7, "retry_delay": timedelta(hours=2), } ) swh_scheduler.add_load_archive_task_type = add_load_archive_task_type yield swh_scheduler config["scheduler"] = scheduler get_scheduler_load_task_types.cache_clear() @pytest.fixture(scope="session") def django_db_setup(request, django_db_blocker, postgresql_proc): from django.conf import settings settings.DATABASES["default"].update( { ("ENGINE", "django.db.backends.postgresql"), ("NAME", get_config()["test_db"]["name"]), ("USER", postgresql_proc.user), ("HOST", postgresql_proc.host), ("PORT", postgresql_proc.port), } ) with django_db_blocker.unblock(): setup_databases( verbosity=request.config.option.verbose, interactive=False, keepdb=False ) @pytest.fixture def staff_user(): return User.objects.create_user(username="admin", password="", is_staff=True) @pytest.fixture def regular_user(): return User.objects.create_user(username="johndoe", password="") 
@pytest.fixture def regular_user2(): return User.objects.create_user(username="janedoe", password="") @pytest.fixture def add_forge_moderator(): moderator = User.objects.create_user(username="add-forge moderator", password="") moderator.user_permissions.add( create_django_permission(ADD_FORGE_MODERATOR_PERMISSION) ) return moderator @pytest.fixture def mailmap_admin(): mailmap_admin = User.objects.create_user(username="mailmap-admin", password="") mailmap_admin.user_permissions.add( create_django_permission(MAILMAP_ADMIN_PERMISSION) ) return mailmap_admin @pytest.fixture def mailmap_user(): mailmap_user = User.objects.create_user(username="mailmap-user", password="") mailmap_user.user_permissions.add(create_django_permission(MAILMAP_PERMISSION)) return mailmap_user diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py index c5201a30..8c71908f 100644 --- a/swh/web/tests/data.py +++ b/swh/web/tests/data.py @@ -1,544 +1,554 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy from datetime import timedelta import os from pathlib import Path import random import time from typing import Dict, List, Optional, Set from swh.core.config import merge_configs from swh.counters import get_counters from swh.indexer.ctags import CtagsIndexer from swh.indexer.fossology_license import FossologyLicenseIndexer from swh.indexer.mimetype import MimetypeIndexer from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.model import OriginIntrinsicMetadataRow from swh.loader.git.from_disk import GitLoaderFromArchive from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_hex from swh.model.model import ( Content, Directory, Origin, OriginVisit, OriginVisitStatus, Snapshot, ) from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.search import get_search from swh.storage import get_storage from swh.storage.algos.dir_iterators import dir_iterator from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import Sha1 from swh.storage.utils import now from swh.web import config from swh.web.browse.utils import ( get_mimetype_and_encoding_for_content, prepare_content_for_display, re_encode_content, ) from swh.web.common import archive # Module used to initialize data that will be provided as tests input # Base content indexer configuration _TEST_INDEXER_BASE_CONFIG = { "storage": {"cls": "memory"}, - "objstorage": {"cls": "memory", "args": {},}, - "indexer_storage": {"cls": "memory", "args": {},}, + "objstorage": { + "cls": "memory", + "args": {}, + }, + "indexer_storage": { + "cls": "memory", + "args": {}, + }, } def random_sha1(): return hash_to_hex(bytes(random.randint(0, 255) for _ in range(20))) def random_sha256(): return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32))) def random_blake2s256(): return hash_to_hex(bytes(random.randint(0, 255) for _ in range(32))) def random_content(): return { "sha1": random_sha1(), "sha1_git": random_sha1(), "sha256": random_sha256(), "blake2s256": random_blake2s256(), } _TEST_MIMETYPE_INDEXER_CONFIG = merge_configs( _TEST_INDEXER_BASE_CONFIG, { "tools": { "name": "file", "version": "1:5.30-1+deb9u1", "configuration": {"type": "library", "debian-package": "python3-magic"}, } }, ) _TEST_LICENSE_INDEXER_CONFIG = merge_configs( _TEST_INDEXER_BASE_CONFIG, { 
"workdir": "/tmp/swh/indexer.fossology.license", "tools": { "name": "nomos", "version": "3.1.0rc2-31-ga2cbb8c", - "configuration": {"command_line": "nomossa <filepath>",}, + "configuration": { + "command_line": "nomossa <filepath>", + }, }, }, ) _TEST_CTAGS_INDEXER_CONFIG = merge_configs( _TEST_INDEXER_BASE_CONFIG, { "workdir": "/tmp/swh/indexer.ctags", "languages": {"c": "c"}, "tools": { "name": "universal-ctags", "version": "~git7859817b", "configuration": { "command_line": """ctags --fields=+lnz --sort=no --links=no """ """--output-format=json <filepath>""" }, }, }, ) # Lightweight git repositories that will be loaded to generate # input data for tests _TEST_ORIGINS = [ { "type": "git", "url": "https://github.com/memononen/libtess2", "archives": ["libtess2.zip"], "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": ( "Game and tools oriented refactored version of GLU tessellator." ), }, }, { "type": "git", "url": "https://github.com/wcoder/highlightjs-line-numbers.js", "archives": [ "highlightjs-line-numbers.js.zip", "highlightjs-line-numbers.js_visit2.zip", ], "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "Line numbering plugin for Highlight.js", }, }, { "type": "git", "url": "repo_with_submodules", "archives": ["repo_with_submodules.tgz"], "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "This is just a sample repository with submodules", }, }, ] _contents = {} def _add_extra_contents(storage, contents): pbm_image_data = b"""P1 # PBM example 24 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0""" # add file with mimetype image/x-portable-bitmap in the archive content pbm_content = Content.from_data(pbm_image_data) storage.content_add([pbm_content]) contents.add(pbm_content.sha1) # add file with mimetype application/pgp-keys in the archive content gpg_path = os.path.join( os.path.dirname(__file__), "resources/contents/other/extensions/public.gpg" ) gpg_content = Content.from_data(Path(gpg_path).read_bytes()) storage.content_add([gpg_content]) contents.add(gpg_content.sha1) INDEXER_TOOL = { "tool_name": "swh-web tests", "tool_version": "1.0", "tool_configuration": {}, } ORIGIN_METADATA_KEY = "keywords" ORIGIN_METADATA_VALUE = "git" ORIGIN_MASTER_REVISION = {} def _add_origin( storage, search, counters, origin_url, visit_type="git", snapshot_branches={} ): storage.origin_add([Origin(url=origin_url)]) search.origin_update( [{"url": origin_url, "has_visits": True, "visit_types": [visit_type]}] ) counters.add("origin", [origin_url]) date = now() visit = OriginVisit(origin=origin_url, date=date, type=visit_type) visit = storage.origin_visit_add([visit])[0] counters.add("origin_visit", [f"{visit.unique_key()}"]) snapshot = Snapshot.from_dict({"branches": snapshot_branches}) storage.snapshot_add([snapshot]) counters.add("snapshot", [snapshot.id]) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, date=date + timedelta(minutes=1), type=visit.type, status="full", snapshot=snapshot.id, ) storage.origin_visit_status_add([visit_status]) counters.add("origin_visit_status", [f"{visit_status.unique_key()}"]) # Tests data initialization def _init_tests_data(): # To hold reference to 
the memory storage storage = get_storage("memory") # Create search instance search = get_search("memory") search.initialize() search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS) # create the counters instance counters = get_counters("memory") # Create indexer storage instance that will be shared by indexers idx_storage = get_indexer_storage("memory") # Declare a test tool for origin intrinsic metadata tests idx_tool = idx_storage.indexer_configuration_add([INDEXER_TOOL])[0] INDEXER_TOOL["id"] = idx_tool["id"] # Load git repositories from archives for origin in _TEST_ORIGINS: for i, archive_ in enumerate(origin["archives"]): if i > 0: # ensure visit dates will be different when simulating # multiple visits of an origin time.sleep(1) origin_repo_archive = os.path.join( os.path.dirname(__file__), "resources/repos/%s" % archive_ ) loader = GitLoaderFromArchive( - storage, origin["url"], archive_path=origin_repo_archive, + storage, + origin["url"], + archive_path=origin_repo_archive, ) result = loader.load() assert result["status"] == "eventful" ori = storage.origin_get([origin["url"]])[0] origin.update(ori.to_dict()) # add an 'id' key if enabled search.origin_update( [{"url": origin["url"], "has_visits": True, "visit_types": ["git"]}] ) for i in range(250): _add_origin( storage, search, counters, origin_url=f"https://many.origins/{i+1}", visit_type="tar", ) sha1s: Set[Sha1] = set() directories = set() revisions = set() releases = set() snapshots = set() swhids = [] content_path = {} # Get all objects loaded into the test archive common_metadata = {ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE} for origin in _TEST_ORIGINS: origin_revisions = set() snp = snapshot_get_latest(storage, origin["url"]) swhids.append( QualifiedSWHID( object_type=ObjectType.SNAPSHOT, object_id=snp.id, origin=origin["url"] ) ) snapshots.add(hash_to_hex(snp.id)) for branch_name, branch_data in snp.branches.items(): target_type = branch_data.target_type.value if target_type == "revision": origin_revisions.add(branch_data.target) swhids.append( QualifiedSWHID( object_type=ObjectType.REVISION, object_id=branch_data.target, origin=origin["url"], visit=CoreSWHID( object_type=ObjectType.SNAPSHOT, object_id=snp.id ), ) ) if b"master" in branch_name: # Add some origin intrinsic metadata for tests metadata = common_metadata metadata.update(origin.get("metadata", {})) origin_metadata = OriginIntrinsicMetadataRow( id=origin["url"], from_revision=branch_data.target, indexer_configuration_id=idx_tool["id"], metadata=metadata, mappings=[], ) idx_storage.origin_intrinsic_metadata_add([origin_metadata]) search.origin_update( [{"url": origin["url"], "intrinsic_metadata": metadata}] ) ORIGIN_MASTER_REVISION[origin["url"]] = hash_to_hex( branch_data.target ) elif target_type == "release": release = storage.release_get([branch_data.target])[0] origin_revisions.add(release.target) releases.add(hash_to_hex(branch_data.target)) swhids.append( QualifiedSWHID( object_type=ObjectType.RELEASE, object_id=branch_data.target, origin=origin["url"], visit=CoreSWHID( object_type=ObjectType.SNAPSHOT, object_id=snp.id ), ) ) for rev_log in storage.revision_shortlog(origin_revisions): rev_id = rev_log[0] revisions.add(rev_id) for rev in storage.revision_get(sorted(origin_revisions)): if rev is None: continue dir_id = rev.directory directories.add(hash_to_hex(dir_id)) for entry in dir_iterator(storage, dir_id): if entry["type"] == "file": sha1s.add(entry["sha1"]) content_path[entry["sha1"]] = "/".join( [hash_to_hex(dir_id), 
entry["path"].decode("utf-8")] ) swhids.append( QualifiedSWHID( object_type=ObjectType.CONTENT, object_id=entry["sha1_git"], origin=origin["url"], visit=CoreSWHID( object_type=ObjectType.SNAPSHOT, object_id=snp.id ), anchor=CoreSWHID( object_type=ObjectType.REVISION, object_id=rev.id ), path=b"/" + entry["path"], ) ) elif entry["type"] == "dir": directories.add(hash_to_hex(entry["target"])) swhids.append( QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=entry["target"], origin=origin["url"], visit=CoreSWHID( object_type=ObjectType.SNAPSHOT, object_id=snp.id ), anchor=CoreSWHID( object_type=ObjectType.REVISION, object_id=rev.id ), path=b"/" + entry["path"] + b"/", ) ) _add_extra_contents(storage, sha1s) # Get all checksums for each content result: List[Optional[Content]] = storage.content_get(list(sha1s)) contents: List[Dict] = [] for content in result: assert content is not None sha1 = hash_to_hex(content.sha1) content_metadata = { algo: hash_to_hex(getattr(content, algo)) for algo in DEFAULT_ALGORITHMS } path = "" if content.sha1 in content_path: path = content_path[content.sha1] cnt_data = storage.content_get_data(content.sha1) assert cnt_data is not None mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data) _, _, cnt_data = re_encode_content(mimetype, encoding, cnt_data) content_display_data = prepare_content_for_display(cnt_data, mimetype, path) content_metadata.update( { "path": path, "mimetype": mimetype, "encoding": encoding, "hljs_language": content_display_data["language"], "raw_data": cnt_data, "data": content_display_data["content_data"], } ) _contents[sha1] = content_metadata contents.append(content_metadata) # Add the empty directory to the test archive storage.directory_add([Directory(entries=())]) # Add empty content to the test archive storage.content_add([Content.from_data(data=b"")]) # Add fake git origin with pull request branches _add_origin( storage, search, counters, origin_url="https://git.example.org/project", snapshot_branches={ b"refs/heads/master": { "target_type": "revision", "target": next(iter(revisions)), }, **{ f"refs/pull/{i}".encode(): { "target_type": "revision", "target": next(iter(revisions)), } for i in range(300) }, }, ) counters.add("revision", revisions) counters.add("release", releases) counters.add("directory", directories) counters.add("content", [content["sha1"] for content in contents]) # Return tests data return { "search": search, "storage": storage, "idx_storage": idx_storage, "counters": counters, "origins": _TEST_ORIGINS, "contents": list(sorted(contents, key=lambda c: c["sha1"])), "directories": list(sorted(directories)), "releases": list(sorted(releases)), "revisions": list(sorted(map(hash_to_hex, revisions))), "snapshots": list(sorted(snapshots)), "swhids": swhids, } def _init_indexers(tests_data): # Instantiate content indexers that will be used in tests # and force them to use the memory storages indexers = {} for idx_name, idx_class, idx_config in ( ("mimetype_indexer", MimetypeIndexer, _TEST_MIMETYPE_INDEXER_CONFIG), ("license_indexer", FossologyLicenseIndexer, _TEST_LICENSE_INDEXER_CONFIG), ("ctags_indexer", CtagsIndexer, _TEST_CTAGS_INDEXER_CONFIG), ): idx = idx_class(config=idx_config) idx.storage = tests_data["storage"] idx.objstorage = tests_data["storage"].objstorage idx.idx_storage = tests_data["idx_storage"] idx.register_tools(idx.config["tools"]) indexers[idx_name] = idx return indexers def get_content(content_sha1): return _contents.get(content_sha1) _tests_data = None _current_tests_data = None 
_indexer_loggers = {} def get_tests_data(reset=False): """ Initialize tests data and return them in a dict. """ global _tests_data, _current_tests_data if _tests_data is None: _tests_data = _init_tests_data() indexers = _init_indexers(_tests_data) for (name, idx) in indexers.items(): # pytest makes the loggers use a temporary file; and deepcopy # requires serializability. So we remove them, and add them # back after the copy. _indexer_loggers[name] = idx.log del idx.log _tests_data.update(indexers) if reset or _current_tests_data is None: _current_tests_data = deepcopy(_tests_data) for (name, logger) in _indexer_loggers.items(): _current_tests_data[name].log = logger return _current_tests_data def override_storages(storage, idx_storage, search, counters): """ Helper function to replace the storages from which archive data are fetched. """ swh_config = config.get_config() swh_config.update( { "storage": storage, "indexer_storage": idx_storage, "search": search, "counters": counters, } ) archive.storage = storage archive.idx_storage = idx_storage archive.search = search archive.counters = counters diff --git a/swh/web/tests/inbound_email/test_utils.py b/swh/web/tests/inbound_email/test_utils.py index 9a1ed6e5..e82106e7 100644 --- a/swh/web/tests/inbound_email/test_utils.py +++ b/swh/web/tests/inbound_email/test_utils.py @@ -1,298 +1,299 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import email from email.headerregistry import Address from email.message import EmailMessage import email.policy from importlib.resources import open_binary from typing import List import pytest from swh.web.inbound_email import utils def test_extract_recipients(): message = EmailMessage() assert utils.extract_recipients(message) == [] message["To"] = "Test Recipient <test-recipient@example.com>" assert utils.extract_recipients(message) == [ Address(display_name="Test Recipient", addr_spec="test-recipient@example.com") ] message["Cc"] = ( "test-recipient-2@example.com, " "Another Test Recipient <test-recipient-3@example.com>" ) assert utils.extract_recipients(message) == [ Address(display_name="Test Recipient", addr_spec="test-recipient@example.com"), Address(addr_spec="test-recipient-2@example.com"), Address( display_name="Another Test Recipient", addr_spec="test-recipient-3@example.com", ), ] del message["To"] assert utils.extract_recipients(message) == [ Address(addr_spec="test-recipient-2@example.com"), Address( display_name="Another Test Recipient", addr_spec="test-recipient-3@example.com", ), ] def test_single_recipient_matches(): assert ( utils.single_recipient_matches( Address(addr_spec="test@example.com"), "match@example.com" ) is None ) assert utils.single_recipient_matches( Address(addr_spec="match@example.com"), "match@example.com" ) == utils.AddressMatch( recipient=Address(addr_spec="match@example.com"), extension=None ) assert utils.single_recipient_matches( Address(addr_spec="MaTch+12345AbC@exaMple.Com"), "match@example.com" ) == utils.AddressMatch( recipient=Address(addr_spec="MaTch+12345AbC@exaMple.Com"), extension="12345AbC" ) def test_recipient_matches(): message = EmailMessage() assert utils.recipient_matches(message, "match@example.com") == [] message = EmailMessage() message["to"] = "nomatch@example.com" assert utils.recipient_matches(message, "match@example.com") == [] message = 
EmailMessage() message["to"] = "match@example.com" assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match@example.com"), extension=None ) ] message = EmailMessage() message["to"] = "match+extension@example.com" assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match+extension@example.com"), extension="extension", ) ] message = EmailMessage() message["to"] = "match+weird+plussed+extension@example.com" assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match+weird+plussed+extension@example.com"), extension="weird+plussed+extension", ) ] message = EmailMessage() message["to"] = "nomatch@example.com" message["cc"] = ", ".join( ( "match@example.com", "match@notamatch.example.com", "Another Match <match+extension@example.com>", ) ) assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( - recipient=Address(addr_spec="match@example.com"), extension=None, + recipient=Address(addr_spec="match@example.com"), + extension=None, ), utils.AddressMatch( recipient=Address( display_name="Another Match", addr_spec="match+extension@example.com" ), extension="extension", ), ] def test_recipient_matches_casemapping(): message = EmailMessage() message["to"] = "match@example.com" assert utils.recipient_matches(message, "Match@Example.Com") assert utils.recipient_matches(message, "match@example.com") message = EmailMessage() message["to"] = "Match+weirdCaseMapping@Example.Com" matches = utils.recipient_matches(message, "match@example.com") assert matches assert matches[0].extension == "weirdCaseMapping" def test_get_address_for_pk(): salt = "test_salt" pks = [1, 10, 1000] base_address = "base@example.com" addresses = { pk: utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for pk in pks } assert len(set(addresses.values())) == len(addresses) for pk, address in addresses.items(): localpart, _, domain = address.partition("@") base_localpart, _, extension = localpart.partition("+") assert domain == "example.com" assert base_localpart == "base" assert extension.startswith(f"{pk}.") def test_get_address_for_pk_salt(): pk = 1000 base_address = "base@example.com" addresses = [ utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for salt in ["salt1", "salt2"] ] assert len(addresses) == len(set(addresses)) def test_get_pks_from_message(): salt = "test_salt" pks = [1, 10, 1000] base_address = "base@example.com" addresses = { pk: utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for pk in pks } message = EmailMessage() message["To"] = "test@example.com" assert utils.get_pks_from_message(salt, base_address, message) == set() message = EmailMessage() message["To"] = f"Test Address <{addresses[1]}>" assert utils.get_pks_from_message(salt, base_address, message) == {1} message = EmailMessage() message["To"] = f"Test Address <{addresses[1]}>" message["Cc"] = ", ".join( [ f"Test Address <{addresses[1]}>", f"Another Test Address <{addresses[10].lower()}>", "A Third Address <irrelevant@example.com>", ] ) assert utils.get_pks_from_message(salt, base_address, message) == {1, 10} def test_get_pks_from_message_logging(caplog): salt = "test_salt" pks = [1, 10, 1000] base_address = "base@example.com" addresses = { pk: utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for pk in pks } message = EmailMessage() message["To"] = f"Test Address 
<{base_address}>" assert utils.get_pks_from_message(salt, base_address, message) == set() relevant_records = [ record for record in caplog.records if record.name == "swh.web.inbound_email.utils" ] assert len(relevant_records) == 1 assert relevant_records[0].levelname == "DEBUG" assert ( f"{base_address} cannot be matched to a request" in relevant_records[0].getMessage() ) # Replace the signature with "mangle{signature}" mangled_address = addresses[1].replace(".", ".mangle", 1) message = EmailMessage() message["To"] = f"Test Address <{mangled_address}>" assert utils.get_pks_from_message(salt, base_address, message) == set() relevant_records = [ record for record in caplog.records if record.name == "swh.web.inbound_email.utils" ] assert len(relevant_records) == 2 assert relevant_records[0].levelname == "DEBUG" assert relevant_records[1].levelname == "DEBUG" assert f"{mangled_address} failed" in relevant_records[1].getMessage() @pytest.mark.parametrize( "filename,expected_parts,expected_absent", ( pytest.param( "plaintext.eml", [b"Plain text email.\n\n-- \nTest User"], [], id="plaintext", ), pytest.param( "multipart.eml", [b"*Multipart email.*\n\n-- \nTest User"], [], id="multipart", ), pytest.param( "multipart_html_only.eml", [b"<html>", b"<b>Multipart email (a much longer html part).</b>"], [b"<b>Multipart email (short html part)</b>"], id="multipart_html_only", ), pytest.param( "multipart_text_only.eml", [b"*Multipart email, but a longer text part.*\n\n--\nTest User"], [], id="multipart_text_only", ), ), ) def test_get_message_plaintext( filename: str, expected_parts: List[bytes], expected_absent: List[bytes] ): with open_binary("swh.web.tests.inbound_email.resources", filename) as f: message = email.message_from_binary_file(f, policy=email.policy.default) assert isinstance(message, EmailMessage) plaintext = utils.get_message_plaintext(message) assert plaintext is not None if len(expected_parts) == 1: assert plaintext == expected_parts[0] else: for part in expected_parts: assert part in plaintext for part in expected_absent: assert part not in plaintext diff --git a/swh/web/tests/misc/test_badges.py b/swh/web/tests/misc/test_badges.py index c8fc7c02..a5ec1d12 100644 --- a/swh/web/tests/misc/test_badges.py +++ b/swh/web/tests/misc/test_badges.py @@ -1,179 +1,181 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from corsheaders.middleware import ACCESS_CONTROL_ALLOW_ORIGIN from hypothesis import given from swh.model.hashutil import hash_to_bytes from swh.model.swhids import ObjectType, QualifiedSWHID from swh.web.common import archive from swh.web.common.identifiers import resolve_swhid from swh.web.common.utils import reverse from swh.web.misc.badges import _badge_config, _get_logo_data from swh.web.tests.django_asserts import assert_contains from swh.web.tests.strategies import new_origin from swh.web.tests.utils import check_http_get_response def test_content_badge(client, content): _test_badge_endpoints(client, "content", content["sha1_git"]) def test_directory_badge(client, directory): _test_badge_endpoints(client, "directory", directory) def test_origin_badge(client, origin): _test_badge_endpoints(client, "origin", origin["url"]) def test_release_badge(client, release): _test_badge_endpoints(client, "release", release) def test_revision_badge(client, revision): 
_test_badge_endpoints(client, "revision", revision) def test_snapshot_badge(client, snapshot): _test_badge_endpoints(client, "snapshot", snapshot) -@given(new_origin(),) +@given( + new_origin(), +) def test_badge_errors( client, unknown_content, unknown_directory, unknown_release, unknown_revision, unknown_snapshot, invalid_sha1, new_origin, ): for object_type, object_id in ( ("content", unknown_content["sha1_git"]), ("directory", unknown_directory), ("origin", new_origin), ("release", unknown_release), ("revision", unknown_revision), ("snapshot", unknown_snapshot), ): url_args = {"object_type": object_type, "object_id": object_id} url = reverse("swh-badge", url_args=url_args) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml" ) _check_generated_badge(resp, **url_args, error="not found") for object_type, object_id in ( (ObjectType.CONTENT, invalid_sha1), (ObjectType.DIRECTORY, invalid_sha1), (ObjectType.RELEASE, invalid_sha1), (ObjectType.REVISION, invalid_sha1), (ObjectType.SNAPSHOT, invalid_sha1), ): url_args = {"object_type": object_type.name.lower(), "object_id": object_id} url = reverse("swh-badge", url_args=url_args) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml" ) _check_generated_badge(resp, **url_args, error="invalid id") object_swhid = f"swh:1:{object_type.value}:{object_id}" url = reverse("swh-badge-swhid", url_args={"object_swhid": object_swhid}) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml" ) _check_generated_badge(resp, "", "", error="invalid id") def test_badge_endpoints_have_cors_header(client, origin, release): url = reverse( "swh-badge", url_args={"object_type": "origin", "object_id": origin["url"]} ) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml", http_origin="https://example.org", ) assert ACCESS_CONTROL_ALLOW_ORIGIN in resp release_swhid = str( QualifiedSWHID(object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release)) ) url = reverse("swh-badge-swhid", url_args={"object_swhid": release_swhid}) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml", http_origin="https://example.org", ) assert ACCESS_CONTROL_ALLOW_ORIGIN in resp def _test_badge_endpoints(client, object_type: str, object_id: str): url_args = {"object_type": object_type, "object_id": object_id} url = reverse("swh-badge", url_args=url_args) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml" ) _check_generated_badge(resp, **url_args) if object_type != "origin": obj_swhid = str( QualifiedSWHID( object_type=ObjectType[object_type.upper()], object_id=hash_to_bytes(object_id), ) ) url = reverse("swh-badge-swhid", url_args={"object_swhid": obj_swhid}) resp = check_http_get_response( client, url, status_code=200, content_type="image/svg+xml" ) _check_generated_badge(resp, **url_args) def _check_generated_badge(response, object_type, object_id, error=None): assert response.status_code == 200, response.content assert response["Content-Type"] == "image/svg+xml" if not object_type: object_type = "object" if object_type == "origin" and error is None: link = reverse("browse-origin", query_params={"origin_url": object_id}) text = "repository" elif error is None: text = str( QualifiedSWHID( object_type=ObjectType[object_type.upper()], object_id=hash_to_bytes(object_id), ) ) link = resolve_swhid(text)["browse_url"] if object_type == "release": release = 
archive.lookup_release(object_id) text = release["name"] elif error == "invalid id": text = "error" link = f"invalid {object_type} id" object_type = "error" elif error == "not found": text = "error" link = f"{object_type} not found" object_type = "error" assert_contains(response, "<svg ") assert_contains(response, "</svg>") assert_contains(response, _get_logo_data()) assert_contains(response, _badge_config[object_type]["color"]) assert_contains(response, _badge_config[object_type]["title"]) assert_contains(response, text) assert_contains(response, link) diff --git a/swh/web/tests/misc/test_metrics.py b/swh/web/tests/misc/test_metrics.py index 995ed451..0d62b7b8 100644 --- a/swh/web/tests/misc/test_metrics.py +++ b/swh/web/tests/misc/test_metrics.py @@ -1,131 +1,137 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta from itertools import product import random from prometheus_client.exposition import CONTENT_TYPE_LATEST import pytest from swh.web.common.models import ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_PENDING, SAVE_REQUEST_REJECTED, SAVE_TASK_FAILED, SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, SaveOriginRequest, ) from swh.web.common.origin_save import ( ACCEPTED_SAVE_REQUESTS_DELAY_METRIC, ACCEPTED_SAVE_REQUESTS_METRIC, SUBMITTED_SAVE_REQUESTS_METRIC, get_savable_visit_types, ) from swh.web.common.utils import reverse from swh.web.tests.django_asserts import assert_contains from swh.web.tests.utils import check_http_get_response @pytest.mark.django_db def test_origin_save_metrics(client, swh_scheduler): visit_types = get_savable_visit_types() request_statuses = ( SAVE_REQUEST_ACCEPTED, SAVE_REQUEST_REJECTED, SAVE_REQUEST_PENDING, ) load_task_statuses = ( SAVE_TASK_NOT_CREATED, SAVE_TASK_NOT_YET_SCHEDULED, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED, SAVE_TASK_RUNNING, ) for _ in range(random.randint(50, 100)): visit_type = random.choice(visit_types) request_status = random.choice(request_statuses) load_task_status = random.choice(load_task_statuses) sor = SaveOriginRequest.objects.create( origin_url="origin", visit_type=visit_type, status=request_status, loading_task_status=load_task_status, ) if load_task_status in (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED): delay = random.choice(range(60)) sor.visit_date = sor.request_date + timedelta(seconds=delay) sor.save() # Note that this injects dates in the future for the sake of the test only url = reverse("metrics-prometheus") resp = check_http_get_response( client, url, status_code=200, content_type=CONTENT_TYPE_LATEST ) accepted_requests = SaveOriginRequest.objects.filter(status=SAVE_REQUEST_ACCEPTED) labels_set = product(visit_types, load_task_statuses) for labels in labels_set: sor_count = accepted_requests.filter( visit_type=labels[0], loading_task_status=labels[1] ).count() metric_text = ( f"{ACCEPTED_SAVE_REQUESTS_METRIC}{{" f'load_task_status="{labels[1]}",' f'visit_type="{labels[0]}"}} {float(sor_count)}\n' ) assert_contains(resp, metric_text) labels_set = product(visit_types, request_statuses) for labels in labels_set: sor_count = SaveOriginRequest.objects.filter( visit_type=labels[0], status=labels[1] ).count() metric_text = ( f"{SUBMITTED_SAVE_REQUESTS_METRIC}{{" f'status="{labels[1]}",' f'visit_type="{labels[0]}"}} 
{float(sor_count)}\n' ) assert_contains(resp, metric_text) # delay metrics save_requests = SaveOriginRequest.objects.all() - labels_set = product(visit_types, (SAVE_TASK_SUCCEEDED, SAVE_TASK_FAILED,)) + labels_set = product( + visit_types, + ( + SAVE_TASK_SUCCEEDED, + SAVE_TASK_FAILED, + ), + ) for labels in labels_set: sors = save_requests.filter( visit_type=labels[0], loading_task_status=labels[1], visit_date__isnull=False, ) delay = 0 for sor in sors: delay += sor.visit_date.timestamp() - sor.request_date.timestamp() metric_delay_text = ( f"{ACCEPTED_SAVE_REQUESTS_DELAY_METRIC}{{" f'load_task_status="{labels[1]}",' f'visit_type="{labels[0]}"}} {float(delay)}\n' ) assert_contains(resp, metric_delay_text) diff --git a/swh/web/tests/strategies.py b/swh/web/tests/strategies.py index a57a518f..a8dccc8d 100644 --- a/swh/web/tests/strategies.py +++ b/swh/web/tests/strategies.py @@ -1,91 +1,88 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime from hypothesis.extra.dateutil import timezones from hypothesis.strategies import composite, datetimes, lists, text from swh.model.hypothesis_strategies import origins as new_origin_strategy from swh.model.hypothesis_strategies import persons as new_person_strategy from swh.model.hypothesis_strategies import sha1_git from swh.model.hypothesis_strategies import snapshots as new_snapshot from swh.model.model import Revision, RevisionType, TimestampWithTimezone # Module dedicated to the generation of input data for tests through # the use of hypothesis. def new_origin(): """Hypothesis strategy returning a random origin not ingested into the test archive. """ return new_origin_strategy() def visit_dates(nb_dates=None): - """Hypothesis strategy returning a list of visit dates. - """ + """Hypothesis strategy returning a list of visit dates.""" min_size = nb_dates if nb_dates else 2 max_size = nb_dates if nb_dates else 8 return lists( datetimes( min_value=datetime(2015, 1, 1, 0, 0), max_value=datetime.now(), timezones=timezones(), ), min_size=min_size, max_size=max_size, unique=True, ).map(sorted) def new_person(): - """Hypothesis strategy returning random raw swh person data. - """ + """Hypothesis strategy returning random raw swh person data.""" return new_person_strategy() @composite def new_swh_date(draw): - """Hypothesis strategy returning random raw swh date data. - """ + """Hypothesis strategy returning random raw swh date data.""" timestamp = draw( datetimes(min_value=datetime(2015, 1, 1, 0, 0), max_value=datetime.now()).map( lambda d: int(d.timestamp()) ) ) return { "timestamp": timestamp, "offset": 0, "negative_utc": False, } @composite def new_revision(draw): """Hypothesis strategy returning random raw swh revision data not ingested into the test archive. 
""" return Revision( directory=draw(sha1_git()), author=draw(new_person()), committer=draw(new_person()), message=draw(text(min_size=20, max_size=100).map(lambda t: t.encode())), date=TimestampWithTimezone.from_datetime(draw(new_swh_date())), committer_date=TimestampWithTimezone.from_datetime(draw(new_swh_date())), synthetic=False, type=RevisionType.GIT, ) def new_snapshots(nb_snapshots=None): min_size = nb_snapshots if nb_snapshots else 2 max_size = nb_snapshots if nb_snapshots else 8 return lists( new_snapshot(min_size=2, max_size=10, only_objects=True), min_size=min_size, max_size=max_size, ) diff --git a/swh/web/tests/test_config.py b/swh/web/tests/test_config.py index fed25f62..571b3be0 100644 --- a/swh/web/tests/test_config.py +++ b/swh/web/tests/test_config.py @@ -1,23 +1,24 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.web.config import get_config, is_feature_enabled @pytest.mark.parametrize( - "feature_name", ["inexistant-feature", "awesome-stuff"], + "feature_name", + ["inexistant-feature", "awesome-stuff"], ) def test_is_feature_enabled(feature_name): config = get_config() # by default, feature non configured are considered disabled assert is_feature_enabled(feature_name) is False for enabled in [True, False]: # Let's configure the feature config["features"] = {feature_name: enabled} # and check its configuration is properly read assert is_feature_enabled(feature_name) is enabled diff --git a/swh/web/tests/test_migrations.py b/swh/web/tests/test_migrations.py index 572d77f2..88edf1d3 100644 --- a/swh/web/tests/test_migrations.py +++ b/swh/web/tests/test_migrations.py @@ -1,54 +1,60 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information APP_NAME = "swh_web_common" MIGRATION_0008 = "0008_save-code-now_indexes_20210106_1327" MIGRATION_0009 = "0009_saveoriginrequest_visit_status" MIGRATION_0010 = "0010_saveoriginrequest_user_id" MIGRATION_0011 = "0011_saveoriginrequest_user_ids" MIGRATION_0012 = "0012_saveoriginrequest_note" def test_migrations_09_add_visit_status_to_sor_model(migrator): """Ensures the migration adds the visit_status field to SaveOriginRequest table""" - old_state = migrator.apply_initial_migration((APP_NAME, MIGRATION_0008),) + old_state = migrator.apply_initial_migration( + (APP_NAME, MIGRATION_0008), + ) old_model = old_state.apps.get_model(APP_NAME, "SaveOriginRequest") assert hasattr(old_model, "visit_status") is False new_state = migrator.apply_tested_migration((APP_NAME, MIGRATION_0009)) new_model = new_state.apps.get_model(APP_NAME, "SaveOriginRequest") assert hasattr(new_model, "visit_status") is True def test_migrations_10_add_user_id_to_sor_model(migrator): """Ensures the migration adds the user_id field to SaveOriginRequest table""" - old_state = migrator.apply_initial_migration((APP_NAME, MIGRATION_0009),) + old_state = migrator.apply_initial_migration( + (APP_NAME, MIGRATION_0009), + ) old_model = old_state.apps.get_model(APP_NAME, "SaveOriginRequest") assert hasattr(old_model, "user_id") is False new_state = migrator.apply_tested_migration((APP_NAME, MIGRATION_0010)) new_model = new_state.apps.get_model(APP_NAME, "SaveOriginRequest") assert 
hasattr(new_model, "user_id") is True def test_migrations_12_add_note_to_sor_model(migrator): """Ensures the migration adds the user_id field to SaveOriginRequest table""" - old_state = migrator.apply_initial_migration((APP_NAME, MIGRATION_0011),) + old_state = migrator.apply_initial_migration( + (APP_NAME, MIGRATION_0011), + ) old_model = old_state.apps.get_model(APP_NAME, "SaveOriginRequest") assert hasattr(old_model, "note") is False new_state = migrator.apply_tested_migration((APP_NAME, MIGRATION_0012)) new_model = new_state.apps.get_model(APP_NAME, "SaveOriginRequest") assert hasattr(new_model, "note") is True diff --git a/swh/web/tests/test_templates.py b/swh/web/tests/test_templates.py index 10f53ac2..6aa3d47b 100644 --- a/swh/web/tests/test_templates.py +++ b/swh/web/tests/test_templates.py @@ -1,96 +1,99 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy import random from pkg_resources import get_distribution import pytest from swh.web.auth.utils import ADMIN_LIST_DEPOSIT_PERMISSION from swh.web.common.utils import reverse from swh.web.config import SWH_WEB_SERVER_NAME, SWH_WEB_STAGING_SERVER_NAMES, get_config from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.utils import check_http_get_response, create_django_permission swh_web_version = get_distribution("swh.web").version def test_layout_without_ribbon(client): url = reverse("swh-web-homepage") resp = check_http_get_response( client, url, status_code=200, server_name=SWH_WEB_SERVER_NAME ) assert_not_contains(resp, "swh-corner-ribbon") def test_layout_with_staging_ribbon(client): url = reverse("swh-web-homepage") resp = check_http_get_response( client, url, status_code=200, server_name=random.choice(SWH_WEB_STAGING_SERVER_NAMES), ) assert_contains(resp, "swh-corner-ribbon") assert_contains(resp, f"Staging<br/>v{swh_web_version}") def test_layout_with_development_ribbon(client): url = reverse("swh-web-homepage") resp = check_http_get_response( - client, url, status_code=200, server_name="localhost", + client, + url, + status_code=200, + server_name="localhost", ) assert_contains(resp, "swh-corner-ribbon") assert_contains(resp, f"Development<br/>v{swh_web_version.split('+')[0]}") def test_layout_with_oidc_auth_enabled(client): url = reverse("swh-web-homepage") resp = check_http_get_response(client, url, status_code=200) assert_contains(resp, reverse("oidc-login")) def test_layout_without_oidc_auth_enabled(client, mocker): config = deepcopy(get_config()) config["keycloak"]["server_url"] = "" mock_get_config = mocker.patch("swh.web.common.utils.get_config") mock_get_config.return_value = config url = reverse("swh-web-homepage") resp = check_http_get_response(client, url, status_code=200) assert_contains(resp, reverse("login")) def test_layout_swh_web_version_number_display(client): url = reverse("swh-web-homepage") resp = check_http_get_response(client, url, status_code=200) assert_contains(resp, f"swh-web v{swh_web_version}") @pytest.mark.django_db def test_layout_no_deposit_admin_for_anonymous_user(client): url = reverse("swh-web-homepage") resp = check_http_get_response(client, url, status_code=200) assert_not_contains(resp, "swh-deposit-admin-link") @pytest.mark.django_db def test_layout_deposit_admin_for_staff_user(client, staff_user): 
client.force_login(staff_user) url = reverse("swh-web-homepage") resp = check_http_get_response(client, url, status_code=200) assert_contains(resp, "swh-deposit-admin-link") @pytest.mark.django_db def test_layout_deposit_admin_for_user_with_permission(client, regular_user): regular_user.user_permissions.add( create_django_permission(ADMIN_LIST_DEPOSIT_PERMISSION) ) client.force_login(regular_user) url = reverse("swh-web-homepage") resp = check_http_get_response(client, url, status_code=200) assert_contains(resp, "swh-deposit-admin-link") diff --git a/swh/web/tests/utils.py b/swh/web/tests/utils.py index a07c5755..b23d9769 100644 --- a/swh/web/tests/utils.py +++ b/swh/web/tests/utils.py @@ -1,241 +1,244 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Optional, cast from django.contrib.auth.models import Permission from django.contrib.contenttypes.models import ContentType from django.http import HttpResponse, StreamingHttpResponse from django.test.client import Client from rest_framework.response import Response from rest_framework.test import APIClient from swh.web.tests.django_asserts import assert_template_used def _assert_http_response( response: HttpResponse, status_code: int, content_type: str ) -> HttpResponse: if isinstance(response, Response): drf_response = cast(Response, response) error_context = ( drf_response.data.pop("traceback") if isinstance(drf_response.data, dict) and "traceback" in drf_response.data else drf_response.data ) elif isinstance(response, StreamingHttpResponse): error_context = getattr(response, "traceback", response.streaming_content) else: error_context = getattr(response, "traceback", response.content) assert response.status_code == status_code, error_context if content_type != "*/*": assert response["Content-Type"].startswith(content_type) return response def check_http_get_response( client: Client, url: str, status_code: int, content_type: str = "*/*", http_origin: Optional[str] = None, server_name: Optional[str] = None, ) -> HttpResponse: """Helper function to check HTTP response for a GET request. Args: client: Django test client url: URL to check response status_code: expected HTTP status code content_type: expected response content type http_origin: optional HTTP_ORIGIN header value server_name: optional SERVER_NAME for the request (defaults to "testserver") Returns: The HTTP response """ return _assert_http_response( response=client.get( url, HTTP_ACCEPT=content_type, HTTP_ORIGIN=http_origin, SERVER_NAME=server_name if server_name else "testserver", ), status_code=status_code, content_type=content_type, ) def check_http_post_response( client: Client, url: str, status_code: int, content_type: str = "*/*", request_content_type="application/json", data: Optional[Dict[str, Any]] = None, http_origin: Optional[str] = None, ) -> HttpResponse: """Helper function to check HTTP response for a POST request. 
Args: client: Django test client url: URL to check response status_code: expected HTTP status code content_type: expected response content type request_content_type: content type of request body data: optional POST data http_origin: optional HTTP_ORIGIN header value Returns: The HTTP response """ return _assert_http_response( response=client.post( url, data=data, content_type=request_content_type, HTTP_ACCEPT=content_type, HTTP_ORIGIN=http_origin, ), status_code=status_code, content_type=content_type, ) def check_api_get_responses( api_client: APIClient, url: str, status_code: int ) -> Response: """Helper function to check Web API responses for GET requests for all accepted content types (JSON, YAML, HTML). Args: api_client: DRF test client url: Web API URL to check responses status_code: expected HTTP status code Returns: The Web API JSON response """ # check JSON response response_json = check_http_get_response( api_client, url, status_code, content_type="application/json" ) # check HTML response (API Web UI) check_http_get_response(api_client, url, status_code, content_type="text/html") # check YAML response check_http_get_response( api_client, url, status_code, content_type="application/yaml" ) return cast(Response, response_json) def check_api_post_response( api_client: APIClient, url: str, status_code: int, content_type: str = "*/*", data: Optional[Dict[str, Any]] = None, ) -> HttpResponse: """Helper function to check Web API response for a POST request for all accepted content types. Args: api_client: DRF test client url: Web API URL to check response status_code: expected HTTP status code content_type: expected response content type data: optional POST data Returns: The HTTP response """ return _assert_http_response( response=api_client.post( - url, data=data, format="json", HTTP_ACCEPT=content_type, + url, + data=data, + format="json", + HTTP_ACCEPT=content_type, ), status_code=status_code, content_type=content_type, ) def check_api_post_responses( api_client: APIClient, url: str, status_code: int, data: Optional[Dict[str, Any]] = None, ) -> Response: """Helper function to check Web API responses for POST requests for all accepted content types (JSON, YAML). Args: api_client: DRF test client url: Web API URL to check responses status_code: expected HTTP status code data: optional POST data Returns: The Web API JSON response """ # check JSON response response_json = check_api_post_response( api_client, url, status_code, content_type="application/json", data=data ) # check YAML response check_api_post_response( api_client, url, status_code, content_type="application/yaml", data=data ) return cast(Response, response_json) def check_html_get_response( client: Client, url: str, status_code: int, template_used: Optional[str] = None ) -> HttpResponse: """Helper function to check HTML responses for a GET request. Args: client: Django test client url: URL to check responses status_code: expected HTTP status code template_used: optional Django template expected to be used in the response Returns: The HTML response """ response = check_http_get_response( client, url, status_code, content_type="text/html" ) if template_used is not None: assert_template_used(response, template_used) return response def create_django_permission(perm_name: str) -> Permission: """Create a permission out of a permission name string Args: perm_name: Permission name (e.g. swh.web.api.throttling_exempted, swh.ambassador, ...) 
Returns: The persisted permission """ perm_splitted = perm_name.split(".") app_label = ".".join(perm_splitted[:-1]) perm_name = perm_splitted[-1] content_type = ContentType.objects.create( id=1000 + ContentType.objects.count(), app_label=app_label, model=perm_splitted[-1], ) return Permission.objects.create( codename=perm_name, name=perm_name, content_type=content_type, id=1000 + Permission.objects.count(), )
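# A hypothetical usage sketch for ``create_django_permission`` above; the
# username is illustrative and the permission string reuses the docstring
# example. This mirrors how the ``add_forge_moderator`` and ``mailmap_admin``
# fixtures grant permissions to test users:
#
#     user = User.objects.create_user(username="someone", password="")
#     user.user_permissions.add(
#         create_django_permission("swh.web.api.throttling_exempted")
#     )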