diff --git a/swh/web/api/views/add_forge_now.py b/swh/web/api/views/add_forge_now.py index be649272..b388ddeb 100644 --- a/swh/web/api/views/add_forge_now.py +++ b/swh/web/api/views/add_forge_now.py @@ -1,410 +1,413 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json from typing import Any, Dict, Union from django.core.exceptions import ObjectDoesNotExist from django.core.paginator import Paginator from django.db import transaction from django.db.models.query import QuerySet from django.forms import CharField, ModelForm from django.http import HttpResponseBadRequest from django.http.request import HttpRequest from django.http.response import HttpResponse, HttpResponseForbidden from rest_framework import serializers from rest_framework.request import Request from rest_framework.response import Response from swh.web.add_forge_now.models import Request as AddForgeRequest from swh.web.add_forge_now.models import RequestActorRole as AddForgeNowRequestActorRole from swh.web.add_forge_now.models import RequestHistory as AddForgeNowRequestHistory from swh.web.add_forge_now.models import RequestStatus as AddForgeNowRequestStatus from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.auth.utils import is_add_forge_now_moderator from swh.web.common.exc import BadInputExc from swh.web.common.utils import reverse def _block_while_testing(): """Replaced by tests to check concurrency behavior""" pass class AddForgeNowRequestForm(ModelForm): forge_contact_comment = CharField( required=False, ) class Meta: model = AddForgeRequest fields = ( "forge_type", "forge_url", "forge_contact_email", "forge_contact_name", "forge_contact_comment", "submitter_forward_username", ) class AddForgeNowRequestHistoryForm(ModelForm): new_status = CharField( max_length=200, required=False, ) class Meta: model = AddForgeNowRequestHistory fields = ("text", "new_status") class AddForgeNowRequestSerializer(serializers.ModelSerializer): inbound_email_address = serializers.CharField() forge_domain = serializers.CharField() last_moderator = serializers.SerializerMethodField() last_modified_date = serializers.SerializerMethodField() history: Dict[int, QuerySet] = {} class Meta: model = AddForgeRequest fields = "__all__" def _gethistory(self, request): if request.id not in self.history: self.history[request.id] = AddForgeNowRequestHistory.objects.filter( request=request ).order_by("id") return self.history[request.id] def get_last_moderator(self, request): last_history_with_moderator = ( self._gethistory(request).filter(actor_role="MODERATOR").last() ) return ( last_history_with_moderator.actor if last_history_with_moderator else "None" ) def get_last_modified_date(self, request): last_history = self._gethistory(request).last() return ( last_history.date.isoformat().replace("+00:00", "Z") if last_history else None ) class AddForgeNowRequestPublicSerializer(serializers.ModelSerializer): """Serializes AddForgeRequest without private fields.""" class Meta: model = AddForgeRequest fields = ("id", "forge_url", "forge_type", "status", "submission_date") class AddForgeNowRequestHistorySerializer(serializers.ModelSerializer): message_source_url = serializers.SerializerMethodField() class Meta: model = AddForgeNowRequestHistory exclude = ("request", "message_source") def get_message_source_url(self, request_history): if request_history.message_source is None: return None return reverse( "forge-add-message-source", url_args={"id": request_history.pk}, request=self.context["request"], ) class AddForgeNowRequestHistoryPublicSerializer(serializers.ModelSerializer): class Meta: model = AddForgeNowRequestHistory fields = ("id", "date", "new_status", "actor_role") @api_route( r"/add-forge/request/create/", "api-1-add-forge-request-create", methods=["POST"], ) @api_doc("/add-forge/request/create") @format_docstring() @transaction.atomic def api_add_forge_request_create(request: Union[HttpRequest, Request]) -> HttpResponse: """ .. http:post:: /api/1/add-forge/request/create/ Create a new request to add a forge to the list of those crawled regularly by Software Heritage. .. warning:: That endpoint is not publicly available and requires authentication in order to be able to request it. {common_headers} :[0-9]+)/update/", "api-1-add-forge-request-update", methods=["POST"], ) @api_doc("/add-forge/request/update", tags=["hidden"]) @format_docstring() @transaction.atomic def api_add_forge_request_update( request: Union[HttpRequest, Request], id: int ) -> HttpResponse: """ .. http:post:: /api/1/add-forge/request/update/ Update a request to add a forge to the list of those crawled regularly by Software Heritage. .. warning:: That endpoint is not publicly available and requires authentication in order to be able to request it. {common_headers} :[0-9]+)/get/", "api-1-add-forge-request-get", methods=["GET"], ) @api_doc("/add-forge/request/get") @format_docstring() def api_add_forge_request_get(request: Request, id: int): """ .. http:get:: /api/1/add-forge/request/get/ Return all details about an add-forge request. {common_headers} :param int id: add-forge request identifier :statuscode 200: request details successfully returned :statuscode 400: request identifier does not exist """ try: add_forge_request = AddForgeRequest.objects.get(id=id) except ObjectDoesNotExist: raise BadInputExc("Request id does not exist") request_history = AddForgeNowRequestHistory.objects.filter( request=add_forge_request ).order_by("id") if is_add_forge_now_moderator(request.user): data = AddForgeNowRequestSerializer(add_forge_request).data history = AddForgeNowRequestHistorySerializer( request_history, many=True, context={"request": request} ).data else: data = AddForgeNowRequestPublicSerializer(add_forge_request).data history = AddForgeNowRequestHistoryPublicSerializer( request_history, many=True ).data return {"request": data, "history": history} diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py index 4b89970f..a8610f11 100644 --- a/swh/web/api/views/origin.py +++ b/swh/web/api/views/origin.py @@ -1,500 +1,500 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.util import strtobool from functools import partial from typing import Dict from rest_framework.request import Request from swh.search.exc import SearchQuerySyntaxError from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.utils import ( enrich_origin, enrich_origin_search_result, enrich_origin_visit, ) from swh.web.api.views.utils import api_lookup from swh.web.common import archive from swh.web.common.exc import BadInputExc from swh.web.common.origin_visits import get_origin_visits from swh.web.common.utils import origin_visit_types, reverse DOC_RETURN_ORIGIN = """ :>json string origin_visits_url: link to in order to get information about the visits for that origin :>json string url: the origin canonical url """ DOC_RETURN_ORIGIN_ARRAY = DOC_RETURN_ORIGIN.replace(":>json", ":>jsonarr") DOC_RETURN_ORIGIN_VISIT = """ :>json string date: ISO8601/RFC3339 representation of the visit date (in UTC) :>json str origin: the origin canonical url :>json string origin_url: link to get information about the origin :>jsonarr string snapshot: the snapshot identifier of the visit (may be null if status is not **full**). :>jsonarr string snapshot_url: link to :http:get:`/api/1/snapshot/(snapshot_id)/` in order to get information about the snapshot of the visit (may be null if status is not **full**). :>json string status: status of the visit (either **full**, **partial** or **ongoing**) :>json number visit: the unique identifier of the visit """ DOC_RETURN_ORIGIN_VISIT_ARRAY = DOC_RETURN_ORIGIN_VISIT.replace(":>json", ":>jsonarr") DOC_RETURN_ORIGIN_VISIT_ARRAY += """ :>jsonarr number id: the unique identifier of the origin :>jsonarr string origin_visit_url: link to :http:get:`/api/1/origin/(origin_url)/visit/(visit_id)/` in order to get information about the visit """ @api_route(r"/origins/", "api-1-origins") @api_doc("/origins/", noargs=True) @format_docstring(return_origin_array=DOC_RETURN_ORIGIN_ARRAY) def api_origins(request: Request): """ .. http:get:: /api/1/origins/ Get list of archived software origins. .. warning:: This endpoint used to provide an ``origin_from`` query parameter, and guarantee an order on results. This is no longer true, and only the Link header should be used for paginating through results. :query int origin_count: The maximum number of origins to return (default to 100, can not exceed 10000) {return_origin_array} {common_headers} {resheader_link} :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`origins?origin_count=500` """ old_param_origin_from = request.query_params.get("origin_from") if old_param_origin_from: raise BadInputExc("Please use the Link header to browse through result") page_token = request.query_params.get("page_token", None) limit = min(int(request.query_params.get("origin_count", "100")), 10000) page_result = archive.lookup_origins(page_token, limit) origins = [enrich_origin(o, request=request) for o in page_result.results] next_page_token = page_result.next_page_token headers: Dict[str, str] = {} if next_page_token is not None: headers["link-next"] = reverse( "api-1-origins", - query_params={"page_token": next_page_token, "origin_count": limit}, + query_params={"page_token": next_page_token, "origin_count": str(limit)}, request=request, ) return {"results": origins, "headers": headers} @api_route(r"/origin/(?P.+)/get/", "api-1-origin") @api_doc("/origin/") @format_docstring(return_origin=DOC_RETURN_ORIGIN) def api_origin(request: Request, origin_url: str): """ .. http:get:: /api/1/origin/(origin_url)/get/ Get information about a software origin. :param string origin_url: the origin url {return_origin} {common_headers} :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/python/cpython/get/` """ ori_dict = {"url": origin_url} error_msg = "Origin with url %s not found." % ori_dict["url"] return api_lookup( archive.lookup_origin, ori_dict, notfound_msg=error_msg, enrich_fn=enrich_origin, request=request, ) def _visit_types() -> str: docstring = "" # available visit types are queried using swh-search so we do it in a try # block in case of failure (for instance in docker environment when # elasticsearch service is not available) try: visit_types = [f"**{visit_type}**" for visit_type in origin_visit_types()] docstring = ", ".join(visit_types[:-1]) + f", and {visit_types[-1]}" except Exception: docstring = "???" pass return docstring @api_route( r"/origin/search/(?P.*)/", "api-1-origin-search", throttle_scope="swh_api_origin_search", ) @api_doc("/origin/search/") @format_docstring( return_origin_array=DOC_RETURN_ORIGIN_ARRAY, visit_types=_visit_types() ) def api_origin_search(request: Request, url_pattern: str): """ .. http:get:: /api/1/origin/search/(url_pattern)/ Search for software origins whose urls contain a provided string pattern or match a provided regular expression. The search is performed in a case insensitive way. .. warning:: This endpoint used to provide an ``offset`` query parameter, and guarantee an order on results. This is no longer true, and only the Link header should be used for paginating through results. :param string url_pattern: a string pattern :query boolean use_ql: whether to use swh search query language or not :query int limit: the maximum number of found origins to return (bounded to 1000) :query boolean with_visit: if true, only return origins with at least one visit by Software heritage :query string visit_type: if provided, only return origins with that specific visit type (currently the supported types are {visit_types}) {return_origin_array} {common_headers} {resheader_link} :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`origin/search/python/?limit=2` """ result = {} limit = min(int(request.query_params.get("limit", "70")), 1000) page_token = request.query_params.get("page_token") use_ql = request.query_params.get("use_ql", "false") with_visit = request.query_params.get("with_visit", "false") visit_type = request.query_params.get("visit_type") try: (results, page_token) = api_lookup( archive.search_origin, url_pattern, bool(strtobool(use_ql)), limit, bool(strtobool(with_visit)), [visit_type] if visit_type else None, page_token, enrich_fn=enrich_origin_search_result, request=request, ) except SearchQuerySyntaxError as e: raise BadInputExc(f"Syntax error in search query: {e.args[0]}") if page_token is not None: query_params = {k: v for (k, v) in request.GET.dict().items()} query_params["page_token"] = page_token result["headers"] = { "link-next": reverse( "api-1-origin-search", url_args={"url_pattern": url_pattern}, query_params=query_params, request=request, ) } result.update({"results": results}) return result @api_route(r"/origin/metadata-search/", "api-1-origin-metadata-search") @api_doc("/origin/metadata-search/", noargs=True) @format_docstring(return_origin_array=DOC_RETURN_ORIGIN_ARRAY) def api_origin_metadata_search(request: Request): """ .. http:get:: /api/1/origin/metadata-search/ Search for software origins whose metadata (expressed as a JSON-LD/CodeMeta dictionary) match the provided criteria. For now, only full-text search on this dictionary is supported. :query str fulltext: a string that will be matched against origin metadata; results are ranked and ordered starting with the best ones. :query int limit: the maximum number of found origins to return (bounded to 100) {return_origin_array} {common_headers} :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`origin/metadata-search/?limit=2&fulltext=Jane%20Doe` """ fulltext = request.query_params.get("fulltext", None) limit = min(int(request.query_params.get("limit", "70")), 100) if not fulltext: content = '"fulltext" must be provided and non-empty.' raise BadInputExc(content) results = api_lookup( archive.search_origin_metadata, fulltext, limit, request=request ) return { "results": results, } @api_route(r"/origin/(?P.+)/visits/", "api-1-origin-visits") @api_doc("/origin/visits/") @format_docstring(return_origin_visit_array=DOC_RETURN_ORIGIN_VISIT_ARRAY) def api_origin_visits(request: Request, origin_url: str): """ .. http:get:: /api/1/origin/(origin_url)/visits/ Get information about all visits of a software origin. Visits are returned sorted in descending order according to their date. :param str origin_url: a software origin URL :query int per_page: specify the number of visits to list, for pagination purposes :query int last_visit: visit to start listing from, for pagination purposes {common_headers} {resheader_link} {return_origin_visit_array} :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/hylang/hy/visits/` """ result = {} origin_query = {"url": origin_url} notfound_msg = "No origin {} found".format(origin_url) url_args_next = {"origin_url": origin_url} per_page = int(request.query_params.get("per_page", "10")) last_visit_str = request.query_params.get("last_visit") last_visit = int(last_visit_str) if last_visit_str else None def _lookup_origin_visits(origin_query, last_visit=last_visit, per_page=per_page): all_visits = get_origin_visits(origin_query) all_visits.reverse() visits = [] if not last_visit: visits = all_visits[:per_page] else: for i, v in enumerate(all_visits): if v["visit"] == last_visit: visits = all_visits[i + 1 : i + 1 + per_page] break for v in visits: yield v results = api_lookup( _lookup_origin_visits, origin_query, notfound_msg=notfound_msg, enrich_fn=partial( enrich_origin_visit, with_origin_link=False, with_origin_visit_link=True ), request=request, ) if results: nb_results = len(results) if nb_results == per_page: new_last_visit = results[-1]["visit"] query_params = {} query_params["last_visit"] = new_last_visit if request.query_params.get("per_page"): query_params["per_page"] = per_page result["headers"] = { "link-next": reverse( "api-1-origin-visits", url_args=url_args_next, query_params=query_params, request=request, ) } result.update({"results": results}) return result @api_route( r"/origin/(?P.+)/visit/latest/", "api-1-origin-visit-latest", throttle_scope="swh_api_origin_visit_latest", ) @api_doc("/origin/visit/latest/") @format_docstring(return_origin_visit=DOC_RETURN_ORIGIN_VISIT) def api_origin_visit_latest(request: Request, origin_url: str): """ .. http:get:: /api/1/origin/(origin_url)/visit/latest/ Get information about the latest visit of a software origin. :param str origin_url: a software origin URL :query boolean require_snapshot: if true, only return a visit with a snapshot {common_headers} {return_origin_visit} :statuscode 200: no error :statuscode 404: requested origin or visit can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/hylang/hy/visit/latest/` """ require_snapshot = request.query_params.get("require_snapshot", "false") return api_lookup( archive.lookup_origin_visit_latest, origin_url, bool(strtobool(require_snapshot)), notfound_msg=("No visit for origin {} found".format(origin_url)), enrich_fn=partial( enrich_origin_visit, with_origin_link=True, with_origin_visit_link=False ), request=request, ) @api_route( r"/origin/(?P.+)/visit/(?P[0-9]+)/", "api-1-origin-visit" ) @api_doc("/origin/visit/") @format_docstring(return_origin_visit=DOC_RETURN_ORIGIN_VISIT) def api_origin_visit(request: Request, visit_id: str, origin_url: str): """ .. http:get:: /api/1/origin/(origin_url)/visit/(visit_id)/ Get information about a specific visit of a software origin. :param str origin_url: a software origin URL :param int visit_id: a visit identifier {common_headers} {return_origin_visit} :statuscode 200: no error :statuscode 404: requested origin or visit can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/hylang/hy/visit/1/` """ return api_lookup( archive.lookup_origin_visit, origin_url, int(visit_id), notfound_msg=("No visit {} for origin {} found".format(visit_id, origin_url)), enrich_fn=partial( enrich_origin_visit, with_origin_link=True, with_origin_visit_link=False ), request=request, ) @api_route( r"/origin/(?P.+)/intrinsic-metadata/", "api-origin-intrinsic-metadata" ) @api_doc("/origin/intrinsic-metadata/") @format_docstring() def api_origin_intrinsic_metadata(request: Request, origin_url: str): """ .. http:get:: /api/1/origin/(origin_url)/intrinsic-metadata Get intrinsic metadata of a software origin (as a JSON-LD/CodeMeta dictionary). :param string origin_url: the origin url :>json string ???: intrinsic metadata field of the origin {common_headers} :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/python/cpython/intrinsic-metadata` """ return api_lookup( archive.lookup_origin_intrinsic_metadata, origin_url, notfound_msg=f"Origin with url {origin_url} not found", enrich_fn=enrich_origin, request=request, ) diff --git a/swh/web/api/views/snapshot.py b/swh/web/api/views/snapshot.py index 02b7e6ee..03244e0e 100644 --- a/swh/web/api/views/snapshot.py +++ b/swh/web/api/views/snapshot.py @@ -1,106 +1,106 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.request import Request from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.utils import enrich_snapshot from swh.web.api.views.utils import api_lookup from swh.web.common import archive from swh.web.common.utils import reverse from swh.web.config import get_config @api_route( r"/snapshot/(?P[0-9a-f]+)/", "api-1-snapshot", checksum_args=["snapshot_id"], ) @api_doc("/snapshot/") @format_docstring() def api_snapshot(request: Request, snapshot_id: str): """ .. http:get:: /api/1/snapshot/(snapshot_id)/ Get information about a snapshot in the archive. A snapshot is a set of named branches, which are pointers to objects at any level of the Software Heritage DAG. It represents a full picture of an origin at a given time. As well as pointing to other objects in the Software Heritage DAG, branches can also be aliases, in which case their target is the name of another branch in the same snapshot, or dangling, in which case the target is unknown. A snapshot identifier is a salted sha1. See :func:`swh.model.git_objects.snapshot_git_object` in our data model module for details about how they are computed. :param sha1 snapshot_id: a snapshot identifier :query str branches_from: optional parameter used to skip branches whose name is lesser than it before returning them :query int branches_count: optional parameter used to restrain the amount of returned branches (default to 1000) :query str target_types: optional comma separated list parameter used to filter the target types of branch to return (possible values that can be contained in that list are ``content``, ``directory``, ``revision``, ``release``, ``snapshot`` or ``alias``) {common_headers} {resheader_link} :>json object branches: object containing all branches associated to the snapshot,for each of them the associated target type and id are given but also a link to get information about that target :>json string id: the unique identifier of the snapshot :statuscode 200: no error :statuscode 400: an invalid snapshot identifier has been provided :statuscode 404: requested snapshot can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`snapshot/6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a/` """ snapshot_content_max_size = get_config()["snapshot_content_max_size"] branches_from = request.GET.get("branches_from", "") branches_count = int(request.GET.get("branches_count", snapshot_content_max_size)) target_types_str = request.GET.get("target_types", None) target_types = target_types_str.split(",") if target_types_str else None results = api_lookup( archive.lookup_snapshot, snapshot_id, branches_from, branches_count, target_types, branch_name_exclude_prefix=None, notfound_msg="Snapshot with id {} not found.".format(snapshot_id), enrich_fn=enrich_snapshot, request=request, ) response = {"results": results, "headers": {}} if results["next_branch"] is not None: response["headers"]["link-next"] = reverse( "api-1-snapshot", url_args={"snapshot_id": snapshot_id}, query_params={ "branches_from": results["next_branch"], - "branches_count": branches_count, - "target_types": target_types, + "branches_count": str(branches_count), + "target_types": ",".join(target_types) if target_types else None, }, request=request, ) return response diff --git a/swh/web/browse/identifiers.py b/swh/web/browse/identifiers.py index f096a250..8120b4f0 100644 --- a/swh/web/browse/identifiers.py +++ b/swh/web/browse/identifiers.py @@ -1,20 +1,20 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.http import HttpRequest, HttpResponse from django.shortcuts import redirect from swh.web.common.identifiers import resolve_swhid def swhid_browse(request: HttpRequest, swhid: str) -> HttpResponse: """ Django view enabling to browse the archive using :ref:`persistent-identifiers`. The url that points to it is :http:get:`/(swhid)/`. """ - swhid_resolved = resolve_swhid(swhid, query_params=request.GET) + swhid_resolved = resolve_swhid(swhid, query_params=request.GET.dict()) assert swhid_resolved["browse_url"] return redirect(swhid_resolved["browse_url"]) diff --git a/swh/web/browse/views/revision.py b/swh/web/browse/views/revision.py index 4174b191..f2fd0af2 100644 --- a/swh/web/browse/views/revision.py +++ b/swh/web/browse/views/revision.py @@ -1,600 +1,600 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import json import textwrap from typing import Any, Dict, List, Optional from django.http import HttpRequest, HttpResponse, JsonResponse from django.shortcuts import render from django.utils.safestring import mark_safe from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType from swh.web.browse.browseurls import browse_route from swh.web.browse.snapshot_context import get_snapshot_context from swh.web.browse.utils import ( content_display_max_size, format_log_entries, gen_link, gen_person_mail_link, gen_revision_url, get_directory_entries, get_readme_to_display, get_revision_log_url, prepare_content_for_display, request_content, ) from swh.web.common import archive from swh.web.common.exc import NotFoundExc, http_status_code_message from swh.web.common.identifiers import get_swhids_info from swh.web.common.typing import RevisionMetadata, SnapshotContext, SWHObjectInfo from swh.web.common.utils import ( format_utc_iso_date, gen_path_info, reverse, swh_object_icons, ) def _gen_content_url( revision: Dict[str, Any], query_string: str, path: str, snapshot_context: Optional[SnapshotContext], ) -> str: if snapshot_context: query_params = snapshot_context["query_params"] query_params["path"] = path query_params["revision"] = revision["id"] content_url = reverse("browse-origin-content", query_params=query_params) else: content_path = "%s/%s" % (revision["directory"], path) content_url = reverse( "browse-content", url_args={"query_string": query_string}, query_params={"path": content_path}, ) return content_url def _gen_diff_link(idx: int, diff_anchor: str, link_text: str) -> str: if idx < _max_displayed_file_diffs: return gen_link(diff_anchor, link_text) else: return link_text # TODO: put in conf _max_displayed_file_diffs = 1000 def _gen_revision_changes_list( revision: Dict[str, Any], changes: List[Dict[str, Any]], snapshot_context: Optional[SnapshotContext], ) -> str: """ Returns a HTML string describing the file changes introduced in a revision. As this string will be displayed in the browse revision view, links to adequate file diffs are also generated. Args: revision (str): hexadecimal representation of a revision identifier changes (list): list of file changes in the revision snapshot_context (dict): optional origin context used to reverse the content urls Returns: A string to insert in a revision HTML view. """ changes_msg = [] for i, change in enumerate(changes): hasher = hashlib.sha1() from_query_string = "" to_query_string = "" diff_id = "diff-" if change["from"]: from_query_string = "sha1_git:" + change["from"]["target"] diff_id += change["from"]["target"] + "-" + change["from_path"] diff_id += "-" if change["to"]: to_query_string = "sha1_git:" + change["to"]["target"] diff_id += change["to"]["target"] + change["to_path"] change["path"] = change["to_path"] or change["from_path"] url_args = { "from_query_string": from_query_string, "to_query_string": to_query_string, } query_params = {"path": change["path"]} change["diff_url"] = reverse( "diff-contents", url_args=url_args, query_params=query_params ) hasher.update(diff_id.encode("utf-8")) diff_id = hasher.hexdigest() change["id"] = diff_id diff_link = "#diff_" + diff_id if change["type"] == "modify": change["content_url"] = _gen_content_url( revision, to_query_string, change["to_path"], snapshot_context ) changes_msg.append( "modified: %s" % _gen_diff_link(i, diff_link, change["to_path"]) ) elif change["type"] == "insert": change["content_url"] = _gen_content_url( revision, to_query_string, change["to_path"], snapshot_context ) changes_msg.append( "new file: %s" % _gen_diff_link(i, diff_link, change["to_path"]) ) elif change["type"] == "delete": parent = archive.lookup_revision(revision["parents"][0]) change["content_url"] = _gen_content_url( parent, from_query_string, change["from_path"], snapshot_context ) changes_msg.append( "deleted: %s" % _gen_diff_link(i, diff_link, change["from_path"]) ) elif change["type"] == "rename": change["content_url"] = _gen_content_url( revision, to_query_string, change["to_path"], snapshot_context ) link_text = change["from_path"] + " → " + change["to_path"] changes_msg.append( "renamed: %s" % _gen_diff_link(i, diff_link, link_text) ) if not changes: changes_msg.append("No changes") return mark_safe("\n".join(changes_msg)) @browse_route( r"revision/(?P[0-9a-f]+)/diff/", view_name="diff-revision", checksum_args=["sha1_git"], ) def _revision_diff(request: HttpRequest, sha1_git: str) -> HttpResponse: """ Browse internal endpoint to compute revision diff """ revision = archive.lookup_revision(sha1_git) snapshot_context = None origin_url = request.GET.get("origin_url", None) if not origin_url: origin_url = request.GET.get("origin", None) timestamp = request.GET.get("timestamp", None) visit_id_str = request.GET.get("visit_id", None) visit_id = int(visit_id_str) if visit_id_str is not None else None if origin_url: snapshot_context = get_snapshot_context( origin_url=origin_url, timestamp=timestamp, visit_id=visit_id ) changes = archive.diff_revision(sha1_git) changes_msg = _gen_revision_changes_list(revision, changes, snapshot_context) diff_data = { "total_nb_changes": len(changes), "changes": changes[:_max_displayed_file_diffs], "changes_msg": changes_msg, } return JsonResponse(diff_data) NB_LOG_ENTRIES = 100 @browse_route( r"revision/(?P[0-9a-f]+)/log/", view_name="browse-revision-log", checksum_args=["sha1_git"], ) def revision_log_browse(request: HttpRequest, sha1_git: str) -> HttpResponse: """ Django view that produces an HTML display of the history log for a revision identified by its id. The url that points to it is :http:get:`/browse/revision/(sha1_git)/log/` """ origin_url = request.GET.get("origin_url") snapshot_id = request.GET.get("snapshot") snapshot_context = None if origin_url or snapshot_id: visit_id = int(request.GET.get("visit_id", 0)) snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=request.GET.get("timestamp"), visit_id=visit_id or None, branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), revision_id=sha1_git, ) per_page = int(request.GET.get("per_page", NB_LOG_ENTRIES)) offset = int(request.GET.get("offset", 0)) revs_ordering = request.GET.get("revs_ordering", "committer_date") session_key = "rev_%s_log_ordering_%s" % (sha1_git, revs_ordering) rev_log_session = request.session.get(session_key, None) rev_log = [] revs_walker_state = None if rev_log_session: rev_log = rev_log_session["rev_log"] revs_walker_state = rev_log_session["revs_walker_state"] if len(rev_log) < offset + per_page: revs_walker = archive.get_revisions_walker( revs_ordering, sha1_git, max_revs=offset + per_page + 1, state=revs_walker_state, ) rev_log += [rev["id"] for rev in revs_walker] revs_walker_state = revs_walker.export_state() revs = rev_log[offset : offset + per_page] revision_log = archive.lookup_revision_multiple(revs) request.session[session_key] = { "rev_log": rev_log, "revs_walker_state": revs_walker_state, } revs_ordering = request.GET.get("revs_ordering", "") prev_log_url = None if len(rev_log) > offset + per_page: prev_log_url = reverse( "browse-revision-log", url_args={"sha1_git": sha1_git}, query_params={ - "per_page": per_page, - "offset": offset + per_page, + "per_page": str(per_page), + "offset": str(offset + per_page), "revs_ordering": revs_ordering or None, }, ) next_log_url = None if offset != 0: next_log_url = reverse( "browse-revision-log", url_args={"sha1_git": sha1_git}, query_params={ - "per_page": per_page, - "offset": offset - per_page, + "per_page": str(per_page), + "offset": str(offset - per_page), "revs_ordering": revs_ordering or None, }, ) revision_log_data = format_log_entries(revision_log, per_page) swh_rev_id = str( CoreSWHID(object_type=ObjectType.REVISION, object_id=hash_to_bytes(sha1_git)) ) return render( request, "browse/revision-log.html", { "heading": "Revision history", "swh_object_id": swh_rev_id, "swh_object_name": "Revisions history", "swh_object_metadata": None, "revision_log": revision_log_data, "revs_ordering": revs_ordering, "next_log_url": next_log_url, "prev_log_url": prev_log_url, "breadcrumbs": None, "top_right_link": None, "snapshot_context": snapshot_context, "vault_cooking": None, "show_actions": True, "swhids_info": None, }, ) @browse_route( r"revision/(?P[0-9a-f]+)/", view_name="browse-revision", checksum_args=["sha1_git"], ) def revision_browse(request: HttpRequest, sha1_git: str) -> HttpResponse: """ Django view that produces an HTML display of a revision identified by its id. The url that points to it is :http:get:`/browse/revision/(sha1_git)/`. """ revision = archive.lookup_revision(sha1_git) origin_info = None snapshot_context = None origin_url = request.GET.get("origin_url") if not origin_url: origin_url = request.GET.get("origin") timestamp = request.GET.get("timestamp") visit_id = int(request.GET.get("visit_id", 0)) snapshot_id = request.GET.get("snapshot_id") if not snapshot_id: snapshot_id = request.GET.get("snapshot") path = request.GET.get("path") dir_id = None dirs, files = [], [] content_data = {} if origin_url: try: snapshot_context = get_snapshot_context( snapshot_id=snapshot_id, origin_url=origin_url, timestamp=timestamp, visit_id=visit_id or None, branch_name=request.GET.get("branch"), release_name=request.GET.get("release"), revision_id=sha1_git, path=path, ) except NotFoundExc as e: raw_rev_url = reverse("browse-revision", url_args={"sha1_git": sha1_git}) error_message = ( "The Software Heritage archive has a revision " "with the hash you provided but the origin " "mentioned in your request appears broken: %s. " "Please check the URL and try again.\n\n" "Nevertheless, you can still browse the revision " "without origin information: %s" % (gen_link(origin_url), gen_link(raw_rev_url)) ) if str(e).startswith("Origin"): raise NotFoundExc(error_message) else: raise e origin_info = snapshot_context["origin_info"] snapshot_id = snapshot_context["snapshot_id"] elif snapshot_id: snapshot_context = get_snapshot_context(snapshot_id) error_info: Dict[str, Any] = {"status_code": 200, "description": None} if path: try: file_info = archive.lookup_directory_with_path(revision["directory"], path) if file_info["type"] == "dir": dir_id = file_info["target"] else: query_string = "sha1_git:" + file_info["target"] content_data = request_content(query_string) except NotFoundExc as e: error_info["status_code"] = 404 error_info["description"] = f"NotFoundExc: {str(e)}" else: dir_id = revision["directory"] if dir_id: path = "" if path is None else (path + "/") dirs, files = get_directory_entries(dir_id) revision_metadata = RevisionMetadata( object_type=ObjectType.REVISION, object_id=sha1_git, revision=sha1_git, author=revision["author"]["fullname"] if revision["author"] else "None", author_url=gen_person_mail_link(revision["author"]) if revision["author"] else "None", committer=revision["committer"]["fullname"] if revision["committer"] else "None", committer_url=gen_person_mail_link(revision["committer"]) if revision["committer"] else "None", committer_date=format_utc_iso_date(revision["committer_date"]), date=format_utc_iso_date(revision["date"]), directory=revision["directory"], merge=revision["merge"], metadata=json.dumps( revision["metadata"], sort_keys=True, indent=4, separators=(",", ": ") ), parents=revision["parents"], synthetic=revision["synthetic"], type=revision["type"], snapshot=snapshot_id, origin_url=origin_url, ) message_lines = ["None"] if revision["message"]: message_lines = revision["message"].split("\n") parents = [] for p in revision["parents"]: parent_url = gen_revision_url(p, snapshot_context) parents.append({"id": p, "url": parent_url}) path_info = gen_path_info(path) query_params = snapshot_context["query_params"] if snapshot_context else {} breadcrumbs = [] breadcrumbs.append( { "name": revision["directory"][:7], "url": reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ), } ) for pi in path_info: query_params["path"] = pi["path"] breadcrumbs.append( { "name": pi["name"], "url": reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ), } ) vault_cooking = { "directory_context": False, "directory_swhid": None, "revision_context": True, "revision_swhid": f"swh:1:rev:{sha1_git}", } swh_objects = [SWHObjectInfo(object_type=ObjectType.REVISION, object_id=sha1_git)] content = None content_size = None filename = None mimetype = None language = None readme_name = None readme_url = None readme_html = None readmes = {} extra_context = dict(revision_metadata) extra_context["path"] = f"/{path}" if path else None if content_data: breadcrumbs[-1]["url"] = None content_size = content_data["length"] mimetype = content_data["mimetype"] if content_data["raw_data"]: content_display_data = prepare_content_for_display( content_data["raw_data"], content_data["mimetype"], path ) content = content_display_data["content_data"] language = content_display_data["language"] mimetype = content_display_data["mimetype"] if path: filename = path_info[-1]["name"] query_params["filename"] = filename filepath = "/".join(pi["name"] for pi in path_info[:-1]) extra_context["path"] = f"/{filepath}/" if filepath else "/" extra_context["filename"] = filename top_right_link = { "url": reverse( "browse-content-raw", url_args={"query_string": query_string}, query_params={"filename": filename}, ), "icon": swh_object_icons["content"], "text": "Raw File", } swh_objects.append( SWHObjectInfo(object_type=ObjectType.CONTENT, object_id=file_info["target"]) ) else: for d in dirs: if d["type"] == "rev": d["url"] = reverse( "browse-revision", url_args={"sha1_git": d["target"]} ) else: query_params["path"] = path + d["name"] d["url"] = reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ) for f in files: query_params["path"] = path + f["name"] f["url"] = reverse( "browse-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ) if f["name"].lower().startswith("readme"): readmes[f["name"]] = f["checksums"]["sha1"] readme_name, readme_url, readme_html = get_readme_to_display(readmes) top_right_link = { "url": get_revision_log_url(sha1_git, snapshot_context), "icon": swh_object_icons["revisions history"], "text": "History", } vault_cooking["directory_context"] = True vault_cooking["directory_swhid"] = f"swh:1:dir:{dir_id}" swh_objects.append( SWHObjectInfo(object_type=ObjectType.DIRECTORY, object_id=dir_id) ) query_params.pop("path", None) diff_revision_url = reverse( "diff-revision", url_args={"sha1_git": sha1_git}, query_params=query_params, ) if snapshot_id: swh_objects.append( SWHObjectInfo(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id) ) swhids_info = get_swhids_info(swh_objects, snapshot_context, extra_context) heading = "Revision - %s - %s" % ( sha1_git[:7], textwrap.shorten(message_lines[0], width=70), ) if snapshot_context: context_found = "snapshot: %s" % snapshot_context["snapshot_id"] if origin_info: context_found = "origin: %s" % origin_info["url"] heading += " - %s" % context_found return render( request, "browse/revision.html", { "heading": heading, "swh_object_id": swhids_info[0]["swhid"], "swh_object_name": "Revision", "swh_object_metadata": revision_metadata, "message_header": message_lines[0], "message_body": "\n".join(message_lines[1:]), "parents": parents, "snapshot_context": snapshot_context, "dirs": dirs, "files": files, "content": content, "content_size": content_size, "max_content_size": content_display_max_size, "filename": filename, "encoding": content_data.get("encoding"), "mimetype": mimetype, "language": language, "readme_name": readme_name, "readme_url": readme_url, "readme_html": readme_html, "breadcrumbs": breadcrumbs, "top_right_link": top_right_link, "vault_cooking": vault_cooking, "diff_revision_url": diff_revision_url, "show_actions": True, "swhids_info": swhids_info, "error_code": error_info["status_code"], "error_message": http_status_code_message.get(error_info["status_code"]), "error_description": error_info["description"], }, status=error_info["status_code"], ) diff --git a/swh/web/browse/views/snapshot.py b/swh/web/browse/views/snapshot.py index 4bc775b5..c02258f4 100644 --- a/swh/web/browse/views/snapshot.py +++ b/swh/web/browse/views/snapshot.py @@ -1,237 +1,237 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Optional from django.http import HttpRequest, HttpResponse from django.shortcuts import redirect from swh.web.browse.browseurls import browse_route from swh.web.browse.snapshot_context import ( browse_snapshot_branches, browse_snapshot_directory, browse_snapshot_log, browse_snapshot_releases, get_snapshot_context, ) from swh.web.common.exc import BadInputExc from swh.web.common.utils import redirect_to_new_route, reverse def get_snapshot_from_request(request: HttpRequest) -> str: snapshot = request.GET.get("snapshot") if snapshot: return snapshot if request.GET.get("origin_url") is None: raise BadInputExc("An origin URL must be provided as a query parameter.") return get_snapshot_context( origin_url=request.GET.get("origin_url"), timestamp=request.GET.get("timestamp") )["snapshot_id"] @browse_route( r"snapshot/(?P[0-9a-f]+)/", view_name="browse-snapshot", checksum_args=["snapshot_id"], ) def snapshot_browse(request: HttpRequest, snapshot_id: str) -> HttpResponse: """Django view for browsing the content of a snapshot. The url that points to it is :http:get:`/browse/snapshot/(snapshot_id)/` """ browse_snapshot_url = reverse( "browse-snapshot-directory", url_args={"snapshot_id": snapshot_id}, - query_params=request.GET, + query_params=request.GET.dict(), ) return redirect(browse_snapshot_url) @browse_route( r"snapshot/(?P[0-9a-f]+)/directory/", view_name="browse-snapshot-directory", checksum_args=["snapshot_id"], ) def snapshot_directory_browse(request: HttpRequest, snapshot_id: str) -> HttpResponse: """Django view for browsing the content of a directory collected in a snapshot. The URL that points to it is :http:get:`/browse/snapshot/(snapshot_id)/directory/` """ return browse_snapshot_directory( request, snapshot_id=snapshot_id, path=request.GET.get("path"), origin_url=request.GET.get("origin_url"), ) @browse_route( r"snapshot/(?P[0-9a-f]+)/directory/(?P.+)/", view_name="browse-snapshot-directory-legacy", checksum_args=["snapshot_id"], ) def snapshot_directory_browse_legacy( request: HttpRequest, snapshot_id: str, path: Optional[str] = None ) -> HttpResponse: """Django view for browsing the content of a directory collected in a snapshot. The URL that points to it is :http:get:`/browse/snapshot/(snapshot_id)/directory/(path)/` """ origin_url = request.GET.get("origin_url", None) if not origin_url: origin_url = request.GET.get("origin", None) return browse_snapshot_directory( request, snapshot_id=snapshot_id, path=path, origin_url=origin_url ) @browse_route( r"snapshot/(?P[0-9a-f]+)/content/", view_name="browse-snapshot-content", checksum_args=["snapshot_id"], ) def snapshot_content_browse(request: HttpRequest, snapshot_id: str) -> HttpResponse: """ This route is deprecated; use http:get:`/browse/content` instead Django view that produces an HTML display of a content collected in a snapshot. The url that points to it is :http:get:`/browse/snapshot/(snapshot_id)/content/` """ return redirect_to_new_route(request, "browse-content") @browse_route( r"snapshot/(?P[0-9a-f]+)/content/(?P.+)/", view_name="browse-snapshot-content-legacy", checksum_args=["snapshot_id"], ) def snapshot_content_browse_legacy( request: HttpRequest, snapshot_id: str, path: str ) -> HttpResponse: """ This route is deprecated; use http:get:`/browse/content` instead Django view that produces an HTML display of a content collected in a snapshot. The url that points to it is :http:get:`/browse/snapshot/(snapshot_id)/content/(path)/` """ return redirect_to_new_route(request, "browse-content") @browse_route( r"snapshot/(?P[0-9a-f]+)/log/", r"snapshot/log/", view_name="browse-snapshot-log", checksum_args=["snapshot_id"], ) def snapshot_log_browse( request: HttpRequest, snapshot_id: Optional[str] = None ) -> HttpResponse: """Django view that produces an HTML display of revisions history (aka the commit log) collected in a snapshot. The URLs that point to it are :http:get:`/browse/snapshot/(snapshot_id)/log/` and :http:get:`/browse/snapshot/log/` """ if snapshot_id is None: # This case happens when redirected from /origin/log snapshot_id = get_snapshot_from_request(request) # Redirect to the same route with snapshot_id return redirect( reverse( "browse-snapshot-log", url_args={"snapshot_id": snapshot_id}, query_params=request.GET, ), ) return browse_snapshot_log( request, snapshot_id=snapshot_id, origin_url=request.GET.get("origin_url"), timestamp=request.GET.get("timestamp"), ) @browse_route( r"snapshot/(?P[0-9a-f]+)/branches/", r"snapshot/branches/", view_name="browse-snapshot-branches", checksum_args=["snapshot_id"], ) def snapshot_branches_browse( request: HttpRequest, snapshot_id: Optional[str] = None ) -> HttpResponse: """Django view that produces an HTML display of the list of branches collected in a snapshot. The URLs that point to it are :http:get:`/browse/snapshot/(snapshot_id)/branches/` and :http:get:`/browse/snapshot/branches/` """ if snapshot_id is None: # This case happens when redirected from /origin/branches snapshot_id = get_snapshot_from_request(request) # Redirect to the same route with the newest snapshot_id # for the given origin return redirect( reverse( "browse-snapshot-branches", url_args={"snapshot_id": snapshot_id}, query_params=request.GET, ), ) return browse_snapshot_branches( request, snapshot_id=snapshot_id, origin_url=request.GET.get("origin_url"), timestamp=request.GET.get("timestamp"), branch_name_include=request.GET.get("name_include"), ) @browse_route( r"snapshot/(?P[0-9a-f]+)/releases/", r"snapshot/releases/", view_name="browse-snapshot-releases", checksum_args=["snapshot_id"], ) def snapshot_releases_browse( request: HttpRequest, snapshot_id: Optional[str] = None ) -> HttpResponse: """Django view that produces an HTML display of the list of releases collected in a snapshot. The URLs that point to it are :http:get:`/browse/snapshot/(snapshot_id)/releases/` :http:get:`/browse/snapshot/releases/` """ if snapshot_id is None: # This case happens when redirected from /origin/releases snapshot_id = get_snapshot_from_request(request) # Redirect to the same route with the newest snapshot_id # for the given origin return redirect( reverse( "browse-snapshot-releases", url_args={"snapshot_id": snapshot_id}, query_params=request.GET, ), ) return browse_snapshot_releases( request, snapshot_id=snapshot_id, origin_url=request.GET.get("origin_url"), timestamp=request.GET.get("timestamp"), release_name_include=request.GET.get("name_include"), ) diff --git a/swh/web/common/identifiers.py b/swh/web/common/identifiers.py index 226e63e6..295a7592 100644 --- a/swh/web/common/identifiers.py +++ b/swh/web/common/identifiers.py @@ -1,387 +1,381 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Iterable, List, Mapping, Optional from urllib.parse import quote, unquote from typing_extensions import TypedDict -from django.http import QueryDict - from swh.model.exceptions import ValidationError from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.swhids import ObjectType, QualifiedSWHID from swh.web.common import archive from swh.web.common.exc import BadInputExc from swh.web.common.typing import ( - QueryParameters, SnapshotContext, SWHIDContext, SWHIDInfo, SWHObjectInfo, ) from swh.web.common.utils import reverse def parse_object_type(object_type: str) -> ObjectType: try: return ObjectType[object_type.upper()] except KeyError: valid_types = ", ".join(variant.name.lower() for variant in ObjectType) raise BadInputExc( f"Invalid swh object type! Valid types are {valid_types}; not {object_type}" ) def gen_swhid( object_type: ObjectType, object_id: str, scheme_version: int = 1, metadata: SWHIDContext = {}, ) -> str: """ Returns the SoftWare Heritage persistent IDentifier for a swh object based on: * the object type * the object id * the SWHID scheme version Args: object_type: the swh object type (content/directory/release/revision/snapshot) object_id: the swh object id (hexadecimal representation of its hash value) scheme_version: the scheme version of the SWHIDs Returns: the SWHID of the object Raises: BadInputExc: if the provided parameters do not enable to generate a valid identifier """ try: decoded_object_id = hash_to_bytes(object_id) obj_swhid = str( QualifiedSWHID( object_type=object_type, object_id=decoded_object_id, scheme_version=scheme_version, **metadata, ) ) except (ValidationError, KeyError, ValueError) as e: raise BadInputExc("Invalid object (%s) for SWHID. %s" % (object_id, e)) else: return obj_swhid class ResolvedSWHID(TypedDict): """parsed SWHID with context""" swhid_parsed: QualifiedSWHID """URL to browse object according to SWHID context""" browse_url: Optional[str] def resolve_swhid( - swhid: str, query_params: Optional[QueryParameters] = None + swhid: str, query_params: Optional[Mapping[str, str]] = None ) -> ResolvedSWHID: """ Try to resolve a SoftWare Heritage persistent IDentifier into an url for browsing the targeted object. Args: swhid: a SoftWare Heritage persistent IDentifier query_params: optional dict filled with query parameters to append to the browse url Returns: a dict with the following keys: * **swhid_parsed**: the parsed identifier * **browse_url**: the url for browsing the targeted object """ swhid_parsed = get_swhid(swhid) object_type = swhid_parsed.object_type object_id = swhid_parsed.object_id browse_url = None url_args = {} - query_dict = QueryDict("", mutable=True) fragment = "" process_lines = object_type == ObjectType.CONTENT - if query_params and len(query_params) > 0: - for k in sorted(query_params.keys()): - query_dict[k] = str(query_params[k]) + query_dict: Dict[str, str] = dict(query_params or {}) if swhid_parsed.origin: origin_url = unquote(swhid_parsed.origin) origin_url = archive.lookup_origin({"url": origin_url})["url"] query_dict["origin_url"] = origin_url if swhid_parsed.path and swhid_parsed.path != b"/": query_dict["path"] = swhid_parsed.path.decode("utf8", errors="replace") if swhid_parsed.anchor: directory = b"" if swhid_parsed.anchor.object_type == ObjectType.DIRECTORY: directory = swhid_parsed.anchor.object_id elif swhid_parsed.anchor.object_type == ObjectType.REVISION: revision = archive.lookup_revision( hash_to_hex(swhid_parsed.anchor.object_id) ) directory = revision["directory"] elif swhid_parsed.anchor.object_type == ObjectType.RELEASE: release = archive.lookup_release( hash_to_hex(swhid_parsed.anchor.object_id) ) if release["target_type"] == ObjectType.REVISION.name.lower(): revision = archive.lookup_revision(release["target"]) directory = revision["directory"] if object_type == ObjectType.CONTENT: if ( not swhid_parsed.origin and swhid_parsed.anchor.object_type != ObjectType.REVISION ): # when no origin or revision context, content objects need to have # their path prefixed by root directory id for breadcrumbs display query_dict["path"] = hash_to_hex(directory) + query_dict["path"] - else: + elif query_dict["path"] is not None: # remove leading slash from SWHID content path - query_dict["path"] = str(query_dict["path"]).lstrip("/") - elif object_type == ObjectType.DIRECTORY: + query_dict["path"] = query_dict["path"].lstrip("/") + elif object_type == ObjectType.DIRECTORY and query_dict["path"] is not None: object_id = directory # remove leading and trailing slashes from SWHID directory path - query_dict["path"] = str(query_dict["path"]).strip("/") + query_dict["path"] = query_dict["path"].strip("/") # snapshot context if swhid_parsed.visit: if swhid_parsed.visit.object_type != ObjectType.SNAPSHOT: raise BadInputExc("Visit must be a snapshot SWHID.") query_dict["snapshot"] = hash_to_hex(swhid_parsed.visit.object_id) if swhid_parsed.anchor: if ( swhid_parsed.anchor.object_type == ObjectType.REVISION and object_type != ObjectType.REVISION ): query_dict["revision"] = hash_to_hex(swhid_parsed.anchor.object_id) elif swhid_parsed.anchor.object_type == ObjectType.RELEASE: release = archive.lookup_release( hash_to_hex(swhid_parsed.anchor.object_id) ) if release: query_dict["release"] = release["name"] # browsing content or directory without snapshot context elif ( object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY) and swhid_parsed.anchor ): if swhid_parsed.anchor.object_type == ObjectType.REVISION: # anchor revision, objects are browsed from its view object_type = ObjectType.REVISION object_id = swhid_parsed.anchor.object_id elif ( object_type == ObjectType.DIRECTORY and swhid_parsed.anchor.object_type == ObjectType.DIRECTORY ): # a directory is browsed from its root object_id = swhid_parsed.anchor.object_id if object_type == ObjectType.CONTENT: url_args["query_string"] = f"sha1_git:{hash_to_hex(object_id)}" elif object_type in (ObjectType.DIRECTORY, ObjectType.RELEASE, ObjectType.REVISION): url_args["sha1_git"] = hash_to_hex(object_id) elif object_type == ObjectType.SNAPSHOT: url_args["snapshot_id"] = hash_to_hex(object_id) if swhid_parsed.lines and process_lines: lines = swhid_parsed.lines fragment += "#L" + str(lines[0]) if lines[1]: fragment += "-L" + str(lines[1]) if url_args: browse_url = ( reverse( f"browse-{object_type.name.lower()}", url_args=url_args, query_params=query_dict, ) + fragment ) return ResolvedSWHID(swhid_parsed=swhid_parsed, browse_url=browse_url) def get_swhid(swhid: str) -> QualifiedSWHID: """Check if a SWHID is valid and return it parsed. Args: swhid: a SoftWare Heritage persistent IDentifier. Raises: BadInputExc: if the provided SWHID can not be parsed. Return: A parsed SWHID. """ try: # ensure core part of SWHID is in lower case to avoid parsing error (core, sep, qualifiers) = swhid.partition(";") core = core.lower() return QualifiedSWHID.from_string(core + sep + qualifiers) except ValidationError as ve: raise BadInputExc("Error when parsing identifier: %s" % " ".join(ve.messages)) def group_swhids( swhids: Iterable[QualifiedSWHID], ) -> Dict[ObjectType, List[bytes]]: """ Groups many SoftWare Heritage persistent IDentifiers into a dictionary depending on their type. Args: swhids: an iterable of SoftWare Heritage persistent IDentifier objects Returns: A dictionary with: keys: object types values: object hashes """ swhids_by_type: Dict[ObjectType, List[bytes]] = { ObjectType.CONTENT: [], ObjectType.DIRECTORY: [], ObjectType.REVISION: [], ObjectType.RELEASE: [], ObjectType.SNAPSHOT: [], } for obj_swhid in swhids: obj_id = obj_swhid.object_id obj_type = obj_swhid.object_type swhids_by_type[obj_type].append(hash_to_bytes(obj_id)) return swhids_by_type def get_swhids_info( swh_objects: Iterable[SWHObjectInfo], snapshot_context: Optional[SnapshotContext] = None, extra_context: Optional[Mapping[str, Any]] = None, ) -> List[SWHIDInfo]: """ Returns a list of dict containing info related to SWHIDs of objects. Args: swh_objects: an iterable of dict describing archived objects snapshot_context: optional dict parameter describing the snapshot in which the objects have been found extra_context: optional dict filled with extra contextual info about the objects Returns: a list of dict containing SWHIDs info """ swhids_info = [] for swh_object in swh_objects: if not swh_object["object_id"]: swhids_info.append( SWHIDInfo( object_type=swh_object["object_type"], object_id="", swhid="", swhid_url="", context={}, swhid_with_context=None, swhid_with_context_url=None, ) ) continue object_type = swh_object["object_type"] object_id = swh_object["object_id"] swhid_context: SWHIDContext = {} if snapshot_context: if snapshot_context["origin_info"] is not None: swhid_context["origin"] = quote( snapshot_context["origin_info"]["url"], safe="/?:@&" ) if object_type != ObjectType.SNAPSHOT: swhid_context["visit"] = gen_swhid( ObjectType.SNAPSHOT, snapshot_context["snapshot_id"] ) if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY): if snapshot_context["release_id"] is not None: swhid_context["anchor"] = gen_swhid( ObjectType.RELEASE, snapshot_context["release_id"] ) elif snapshot_context["revision_id"] is not None: swhid_context["anchor"] = gen_swhid( ObjectType.REVISION, snapshot_context["revision_id"] ) if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY): if ( extra_context and "revision" in extra_context and extra_context["revision"] and "anchor" not in swhid_context ): swhid_context["anchor"] = gen_swhid( ObjectType.REVISION, extra_context["revision"] ) elif ( extra_context and "root_directory" in extra_context and extra_context["root_directory"] and "anchor" not in swhid_context and ( object_type != ObjectType.DIRECTORY or extra_context["root_directory"] != object_id ) ): swhid_context["anchor"] = gen_swhid( ObjectType.DIRECTORY, extra_context["root_directory"] ) path = None if extra_context and "path" in extra_context: path = extra_context["path"] or "/" if "filename" in extra_context and object_type == ObjectType.CONTENT: path += extra_context["filename"] if object_type == ObjectType.DIRECTORY and path == "/": path = None if path: swhid_context["path"] = quote(path, safe="/?:@&") swhid = gen_swhid(object_type, object_id) swhid_url = reverse("browse-swhid", url_args={"swhid": swhid}) swhid_with_context = None swhid_with_context_url = None if swhid_context: swhid_with_context = gen_swhid( object_type, object_id, metadata=swhid_context ) swhid_with_context_url = reverse( "browse-swhid", url_args={"swhid": swhid_with_context} ) swhids_info.append( SWHIDInfo( object_type=object_type, object_id=object_id, swhid=swhid, swhid_url=swhid_url, context=swhid_context, swhid_with_context=swhid_with_context, swhid_with_context_url=swhid_with_context_url, ) ) return swhids_info diff --git a/swh/web/common/typing.py b/swh/web/common/typing.py index 6a02ad3e..f037e0f8 100644 --- a/swh/web/common/typing.py +++ b/swh/web/common/typing.py @@ -1,263 +1,259 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar from typing_extensions import TypedDict -from django.http import QueryDict - from swh.core.api.classes import PagedResult as CorePagedResult from swh.model.swhids import ObjectType -QueryParameters = Union[Dict[str, Any], QueryDict] - class OriginInfo(TypedDict): url: str """URL of the origin""" class OriginMetadataInfo(TypedDict): url: str """URL of the origin""" metadata: Dict[str, Any] """Origin metadata associated to the origin""" class OriginVisitInfo(TypedDict): date: str """date of the visit in iso format""" formatted_date: str """formatted date of the visit""" metadata: Dict[str, Any] """metadata associated to the visit""" origin: str """visited origin URL""" snapshot: str """snapshot identifier computed during the visit""" status: str """status of the visit ("ongoing", "full" or "partial") """ type: str """visit type (git, hg, debian, ...)""" url: str """URL to browse the snapshot""" visit: int """visit identifier""" class SnapshotBranchInfo(TypedDict): date: Optional[str] """"author date of branch heading revision""" directory: Optional[str] """directory associated to branch heading revision""" message: Optional[str] """message of branch heading revision""" name: str """branch name""" alias: bool """define if the branch is an alias""" revision: str """branch heading revision""" url: Optional[str] """optional browse URL (content, directory, ...) scoped to branch""" class SnapshotReleaseInfo(TypedDict): branch_name: str """branch name associated to release in snapshot""" date: str """release date""" directory: Optional[str] """optional directory associated to the release""" id: str """release identifier""" message: str """release message""" name: str """release name""" alias: bool """define if the branch is an alias""" target: str """release target""" target_type: str """release target_type""" url: Optional[str] """optional browse URL (content, directory, ...) scoped to release""" class SnapshotContext(TypedDict): branch: Optional[str] """optional branch name set when browsing snapshot in that scope""" branch_alias: bool """indicates if the focused branch is an alias""" branches: List[SnapshotBranchInfo] """list of snapshot branches (possibly truncated)""" branches_url: str """snapshot branches list browse URL""" is_empty: bool """indicates if the snapshot is empty""" origin_info: Optional[OriginInfo] """optional origin info associated to the snapshot""" origin_visits_url: Optional[str] """optional origin visits URL""" - query_params: QueryParameters + query_params: Dict[str, Optional[str]] """common query parameters when browsing snapshot content""" release: Optional[str] """optional release name set when browsing snapshot in that scope""" release_alias: bool """indicates if the focused release is an alias""" release_id: Optional[str] """optional release identifier set when browsing snapshot in that scope""" releases: List[SnapshotReleaseInfo] """list of snapshot releases (possibly truncated)""" releases_url: str """snapshot releases list browse URL""" revision_id: Optional[str] """optional revision identifier set when browsing snapshot in that scope""" revision_info: Optional[Dict[str, Any]] """optional revision info set when browsing snapshot in that scope""" root_directory: Optional[str] """optional root directory identifier set when browsing snapshot content""" snapshot_id: str """snapshot identifier""" snapshot_sizes: Dict[str, int] """snapshot sizes grouped by branch target type""" snapshot_swhid: str """snapshot SWHID""" url_args: Dict[str, Any] """common URL arguments when browsing snapshot content""" visit_info: Optional[OriginVisitInfo] """optional origin visit info associated to the snapshot""" directory_url: Optional[str] """optional root directory URL associated to the snapshot""" class SWHObjectInfo(TypedDict): object_type: ObjectType object_id: Optional[str] class SWHIDContext(TypedDict, total=False): origin: str anchor: str visit: str path: str lines: str class SWHIDInfo(SWHObjectInfo): swhid: str swhid_url: str context: SWHIDContext swhid_with_context: Optional[str] swhid_with_context_url: Optional[str] class SWHObjectInfoMetadata(TypedDict, total=False): origin_url: Optional[str] visit_date: Optional[str] visit_type: Optional[str] class ContentMetadata(SWHObjectInfo, SWHObjectInfoMetadata): sha1: str sha1_git: str sha256: str blake2s256: str content_url: str mimetype: str encoding: str size: int language: str path: Optional[str] filename: Optional[str] directory: Optional[str] root_directory: Optional[str] revision: Optional[str] release: Optional[str] snapshot: Optional[str] class DirectoryMetadata(SWHObjectInfo, SWHObjectInfoMetadata): directory: Optional[str] nb_files: Optional[int] nb_dirs: Optional[int] sum_file_sizes: Optional[int] root_directory: Optional[str] path: Optional[str] revision: Optional[str] revision_found: Optional[bool] release: Optional[str] snapshot: Optional[str] class ReleaseMetadata(SWHObjectInfo, SWHObjectInfoMetadata): release: str author: str author_url: str date: str name: str synthetic: bool target: str target_type: str snapshot: Optional[str] class RevisionMetadata(SWHObjectInfo, SWHObjectInfoMetadata): revision: str author: str author_url: str committer: str committer_url: str date: str committer_date: str directory: str merge: bool metadata: str parents: List[str] synthetic: bool type: str snapshot: Optional[str] TResult = TypeVar("TResult") PagedResult = CorePagedResult[TResult, str] class SaveOriginRequestInfo(TypedDict, total=False): id: int """Unique key""" save_request_date: str """Date of the creation request""" visit_type: str """Type of the visit""" visit_status: Optional[str] """Status of the visit""" origin_url: str """Origin to ingest""" save_request_status: str """Status of the request""" loading_task_id: Optional[int] """Identifier of the loading task in the scheduler if scheduled""" visit_date: Optional[str] """End of the visit if terminated""" save_task_status: str """Status of the scheduled task""" note: Optional[str] """Optional note associated to the request, for instance rejection reason""" class OriginExistenceCheckInfo(TypedDict): origin_url: str """Origin to check""" exists: bool """Does the url exist?""" content_length: Optional[int] """content length of the artifact""" last_modified: Optional[str] """Last modification time reported by the server (as iso8601 string)""" diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py index ab89c23d..27345842 100644 --- a/swh/web/common/utils.py +++ b/swh/web/common/utils.py @@ -1,520 +1,519 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import functools import os import re -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Mapping, Optional import urllib.parse from bs4 import BeautifulSoup from docutils.core import publish_parts import docutils.parsers.rst import docutils.utils from docutils.writers.html5_polyglot import HTMLTranslator, Writer from iso8601 import ParseError, parse_date from pkg_resources import get_distribution from prometheus_client.registry import CollectorRegistry import requests from requests.auth import HTTPBasicAuth from django.core.cache import cache from django.core.cache.backends.base import DEFAULT_TIMEOUT from django.http import HttpRequest, QueryDict from django.shortcuts import redirect from django.urls import resolve from django.urls import reverse as django_reverse from swh.web.auth.utils import ( ADD_FORGE_MODERATOR_PERMISSION, ADMIN_LIST_DEPOSIT_PERMISSION, MAILMAP_ADMIN_PERMISSION, ) from swh.web.common.exc import BadInputExc, sentry_capture_exception -from swh.web.common.typing import QueryParameters from swh.web.config import SWH_WEB_SERVER_NAME, get_config, search SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True) SWHID_RE = "swh:1:[a-z]{3}:[0-9a-z]{40}" swh_object_icons = { "alias": "mdi mdi-star", "branch": "mdi mdi-source-branch", "branches": "mdi mdi-source-branch", "content": "mdi mdi-file-document", "cnt": "mdi mdi-file-document", "directory": "mdi mdi-folder", "dir": "mdi mdi-folder", "origin": "mdi mdi-source-repository", "ori": "mdi mdi-source-repository", "person": "mdi mdi-account", "revisions history": "mdi mdi-history", "release": "mdi mdi-tag", "rel": "mdi mdi-tag", "releases": "mdi mdi-tag", "revision": "mdi mdi-rotate-90 mdi-source-commit", "rev": "mdi mdi-rotate-90 mdi-source-commit", "snapshot": "mdi mdi-camera", "snp": "mdi mdi-camera", "visits": "mdi mdi-calendar-month", } def reverse( viewname: str, url_args: Optional[Dict[str, Any]] = None, - query_params: Optional[QueryParameters] = None, + query_params: Optional[Mapping[str, Optional[str]]] = None, current_app: Optional[str] = None, urlconf: Optional[str] = None, request: Optional[HttpRequest] = None, ) -> str: """An override of django reverse function supporting query parameters. Args: viewname: the name of the django view from which to compute a url url_args: dictionary of url arguments indexed by their names query_params: dictionary of query parameters to append to the reversed url current_app: the name of the django app tighten to the view urlconf: url configuration module request: build an absolute URI if provided Returns: str: the url of the requested view with processed arguments and query parameters """ if url_args: url_args = {k: v for k, v in url_args.items() if v is not None} url = django_reverse( viewname, urlconf=urlconf, kwargs=url_args, current_app=current_app ) + params: Dict[str, str] = {} if query_params: - query_params = {k: v for k, v in query_params.items() if v is not None} + params = {k: v for k, v in query_params.items() if v is not None} - if query_params and len(query_params) > 0: + if params: query_dict = QueryDict("", mutable=True) - for k in sorted(query_params.keys()): - query_dict[k] = str(query_params[k]) + query_dict.update(dict(sorted(params.items()))) url += "?" + query_dict.urlencode(safe="/;:") if request is not None: url = request.build_absolute_uri(url) return url def datetime_to_utc(date): """Returns datetime in UTC without timezone info Args: date (datetime.datetime): input datetime with timezone info Returns: datetime.datetime: datetime in UTC without timezone info """ if date.tzinfo and date.tzinfo != timezone.utc: return date.astimezone(tz=timezone.utc) else: return date def parse_iso8601_date_to_utc(iso_date: str) -> datetime: """Given an ISO 8601 datetime string, parse the result as UTC datetime. Returns: a timezone-aware datetime representing the parsed date Raises: swh.web.common.exc.BadInputExc: provided date does not respect ISO 8601 format Samples: - 2016-01-12 - 2016-01-12T09:19:12+0100 - 2007-01-14T20:34:22Z """ try: date = parse_date(iso_date) return datetime_to_utc(date) except ParseError as e: raise BadInputExc(e) def shorten_path(path): """Shorten the given path: for each hash present, only return the first 8 characters followed by an ellipsis""" sha256_re = r"([0-9a-f]{8})[0-9a-z]{56}" sha1_re = r"([0-9a-f]{8})[0-9a-f]{32}" ret = re.sub(sha256_re, r"\1...", path) return re.sub(sha1_re, r"\1...", ret) def format_utc_iso_date(iso_date, fmt="%d %B %Y, %H:%M:%S UTC"): """Turns a string representation of an ISO 8601 datetime string to UTC and format it into a more human readable one. For instance, from the following input string: '2017-05-04T13:27:13+02:00' the following one is returned: '04 May 2017, 11:27 UTC'. Custom format string may also be provided as parameter Args: iso_date (str): a string representation of an ISO 8601 date fmt (str): optional date formatting string Returns: str: a formatted string representation of the input iso date """ if not iso_date: return iso_date date = parse_iso8601_date_to_utc(iso_date) return date.strftime(fmt) def gen_path_info(path): """Function to generate path data navigation for use with a breadcrumb in the swh web ui. For instance, from a path /folder1/folder2/folder3, it returns the following list:: [{'name': 'folder1', 'path': 'folder1'}, {'name': 'folder2', 'path': 'folder1/folder2'}, {'name': 'folder3', 'path': 'folder1/folder2/folder3'}] Args: path: a filesystem path Returns: list: a list of path data for navigation as illustrated above. """ path_info = [] if path: sub_paths = path.strip("/").split("/") path_from_root = "" for p in sub_paths: path_from_root += "/" + p path_info.append({"name": p, "path": path_from_root.strip("/")}) return path_info def parse_rst(text, report_level=2): """ Parse a reStructuredText string with docutils. Args: text (str): string with reStructuredText markups in it report_level (int): level of docutils report messages to print (1 info 2 warning 3 error 4 severe 5 none) Returns: docutils.nodes.document: a parsed docutils document """ parser = docutils.parsers.rst.Parser() components = (docutils.parsers.rst.Parser,) settings = docutils.frontend.OptionParser( components=components ).get_default_values() settings.report_level = report_level document = docutils.utils.new_document("rst-doc", settings=settings) parser.parse(text, document) return document def get_client_ip(request): """ Return the client IP address from an incoming HTTP request. Args: request (django.http.HttpRequest): the incoming HTTP request Returns: str: The client IP address """ x_forwarded_for = request.META.get("HTTP_X_FORWARDED_FOR") if x_forwarded_for: ip = x_forwarded_for.split(",")[0] else: ip = request.META.get("REMOTE_ADDR") return ip def is_swh_web_development(request: HttpRequest) -> bool: """Indicate if we are running a development version of swh-web.""" site_base_url = request.build_absolute_uri("/") return any( host in site_base_url for host in ("localhost", "127.0.0.1", "testserver") ) def is_swh_web_staging(request: HttpRequest) -> bool: """Indicate if we are running a staging version of swh-web.""" config = get_config() site_base_url = request.build_absolute_uri("/") return any( server_name in site_base_url for server_name in config["staging_server_names"] ) def is_swh_web_production(request: HttpRequest) -> bool: """Indicate if we are running the public production version of swh-web.""" return SWH_WEB_SERVER_NAME in request.build_absolute_uri("/") browsers_supported_image_mimes = set( [ "image/gif", "image/png", "image/jpeg", "image/bmp", "image/webp", "image/svg", "image/svg+xml", ] ) def context_processor(request): """ Django context processor used to inject variables in all swh-web templates. """ config = get_config() if ( hasattr(request, "user") and request.user.is_authenticated and not hasattr(request.user, "backend") ): # To avoid django.template.base.VariableDoesNotExist errors # when rendering templates when standard Django user is logged in. request.user.backend = "django.contrib.auth.backends.ModelBackend" return { "swh_object_icons": swh_object_icons, "available_languages": None, "swh_client_config": config["client_config"], "oidc_enabled": bool(config["keycloak"]["server_url"]), "browsers_supported_image_mimes": browsers_supported_image_mimes, "keycloak": config["keycloak"], "site_base_url": request.build_absolute_uri("/"), "DJANGO_SETTINGS_MODULE": os.environ["DJANGO_SETTINGS_MODULE"], "status": config["status"], "swh_web_dev": is_swh_web_development(request), "swh_web_staging": is_swh_web_staging(request), "swh_web_prod": is_swh_web_production(request), "swh_web_version": get_distribution("swh.web").version, "iframe_mode": False, "ADMIN_LIST_DEPOSIT_PERMISSION": ADMIN_LIST_DEPOSIT_PERMISSION, "ADD_FORGE_MODERATOR_PERMISSION": ADD_FORGE_MODERATOR_PERMISSION, "FEATURES": get_config()["features"], "MAILMAP_ADMIN_PERMISSION": MAILMAP_ADMIN_PERMISSION, } def resolve_branch_alias( snapshot: Dict[str, Any], branch: Optional[Dict[str, Any]] ) -> Optional[Dict[str, Any]]: """ Resolve branch alias in snapshot content. Args: snapshot: a full snapshot content branch: a branch alias contained in the snapshot Returns: The real snapshot branch that got aliased. """ while branch and branch["target_type"] == "alias": if branch["target"] in snapshot["branches"]: branch = snapshot["branches"][branch["target"]] else: from swh.web.common import archive snp = archive.lookup_snapshot( snapshot["id"], branches_from=branch["target"], branches_count=1 ) if snp and branch["target"] in snp["branches"]: branch = snp["branches"][branch["target"]] else: branch = None return branch class _NoHeaderHTMLTranslator(HTMLTranslator): """ Docutils translator subclass to customize the generation of HTML from reST-formatted docstrings """ def __init__(self, document): super().__init__(document) self.body_prefix = [] self.body_suffix = [] _HTML_WRITER = Writer() _HTML_WRITER.translator_class = _NoHeaderHTMLTranslator def rst_to_html(rst: str) -> str: """ Convert reStructuredText document into HTML. Args: rst: A string containing a reStructuredText document Returns: Body content of the produced HTML conversion. """ settings = { "initial_header_level": 2, "halt_level": 4, "traceback": True, "file_insertion_enabled": False, "raw_enabled": False, } pp = publish_parts(rst, writer=_HTML_WRITER, settings_overrides=settings) return f'
{pp["html_body"]}
' def prettify_html(html: str) -> str: """ Prettify an HTML document. Args: html: Input HTML document Returns: The prettified HTML document """ return BeautifulSoup(html, "lxml").prettify() def django_cache( timeout: int = DEFAULT_TIMEOUT, catch_exception: bool = False, exception_return_value: Any = None, invalidate_cache_pred: Callable[[Any], bool] = lambda val: False, ): """Decorator to put the result of a function call in Django cache, subsequent calls will directly return the cached value. Args: timeout: The number of seconds value will be hold in cache catch_exception: If :const:`True`, any thrown exception by the decorated function will be caught and not reraised exception_return_value: The value to return if previous parameter is set to :const:`True` invalidate_cache_pred: A predicate function enabling to invalidate the cache under certain conditions, decorated function will then be called again Returns: The returned value of the decorated function for the specified parameters """ def inner(func): @functools.wraps(func) def wrapper(*args, **kwargs): func_args = args + (0,) + tuple(sorted(kwargs.items())) cache_key = str(hash((func.__module__, func.__name__) + func_args)) ret = cache.get(cache_key) if ret is None or invalidate_cache_pred(ret): try: ret = func(*args, **kwargs) except Exception as exc: if catch_exception: sentry_capture_exception(exc) return exception_return_value else: raise else: cache.set(cache_key, ret, timeout=timeout) return ret return wrapper return inner def _deposits_list_url( deposits_list_base_url: str, page_size: int, username: Optional[str] ) -> str: params = {"page_size": str(page_size)} if username is not None: params["username"] = username return f"{deposits_list_base_url}?{urllib.parse.urlencode(params)}" def get_deposits_list(username: Optional[str] = None) -> List[Dict[str, Any]]: """Return the list of software deposits using swh-deposit API""" config = get_config()["deposit"] private_api_url = config["private_api_url"].rstrip("/") + "/" deposits_list_base_url = private_api_url + "deposits" deposits_list_auth = HTTPBasicAuth( config["private_api_user"], config["private_api_password"] ) deposits_list_url = _deposits_list_url( deposits_list_base_url, page_size=1, username=username ) nb_deposits = requests.get( deposits_list_url, auth=deposits_list_auth, timeout=30 ).json()["count"] @django_cache(invalidate_cache_pred=lambda data: data["count"] != nb_deposits) def _get_deposits_data(): deposits_list_url = _deposits_list_url( deposits_list_base_url, page_size=nb_deposits, username=username ) return requests.get( deposits_list_url, auth=deposits_list_auth, timeout=30, ).json() deposits_data = _get_deposits_data() return deposits_data["results"] _origin_visit_types_cache_timeout = 24 * 60 * 60 # 24 hours @django_cache( timeout=_origin_visit_types_cache_timeout, catch_exception=True, exception_return_value=[], ) def origin_visit_types() -> List[str]: """Return the exhaustive list of visit types for origins ingested into the archive. """ return sorted(search().visit_types_count().keys()) def redirect_to_new_route(request, new_route, permanent=True): """Redirect a request to another route with url args and query parameters eg: /origin//log?path=test can be redirected as /log?url=&path=test. This can be used to deprecate routes """ request_path = resolve(request.path_info) args = {**request_path.kwargs, **request.GET.dict()} return redirect( reverse(new_route, query_params=args), permanent=permanent, ) diff --git a/swh/web/misc/iframe.py b/swh/web/misc/iframe.py index ab69e096..80e0758f 100644 --- a/swh/web/misc/iframe.py +++ b/swh/web/misc/iframe.py @@ -1,340 +1,340 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, List, Optional, Tuple from django.conf.urls import url from django.shortcuts import render from django.views.decorators.clickjacking import xframe_options_exempt from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.web.browse.snapshot_context import get_snapshot_context from swh.web.browse.utils import ( content_display_max_size, get_directory_entries, prepare_content_for_display, request_content, ) from swh.web.common import archive from swh.web.common.exc import BadInputExc, NotFoundExc, http_status_code_message from swh.web.common.identifiers import get_swhid, get_swhids_info from swh.web.common.typing import SnapshotContext, SWHObjectInfo from swh.web.common.utils import gen_path_info, reverse def _get_content_rendering_data(cnt_swhid: QualifiedSWHID, path: str) -> Dict[str, Any]: content_data = request_content(f"sha1_git:{cnt_swhid.object_id.hex()}") content = None language = None mimetype = None if content_data.get("raw_data") is not None: content_display_data = prepare_content_for_display( content_data["raw_data"], content_data["mimetype"], path ) content = content_display_data["content_data"] language = content_display_data["language"] mimetype = content_display_data["mimetype"] return { "content": content, "content_size": content_data.get("length"), "max_content_size": content_display_max_size, "filename": path.split("/")[-1], "encoding": content_data.get("encoding"), "mimetype": mimetype, "language": language, } def _get_directory_rendering_data( dir_swhid: QualifiedSWHID, focus_swhid: QualifiedSWHID, path: str, ) -> Dict[str, Any]: dirs, files = get_directory_entries(dir_swhid.object_id.hex()) for d in dirs: if d["type"] == "rev": d["url"] = None else: dir_swhid = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(d["target"]), origin=dir_swhid.origin, visit=dir_swhid.visit, anchor=dir_swhid.anchor, path=(path or "/") + d["name"] + "/", ) d["url"] = reverse( "swhid-iframe", url_args={"swhid": str(dir_swhid)}, query_params={"focus_swhid": str(focus_swhid)}, ) for f in files: object_id = hash_to_bytes(f["target"]) cnt_swhid = QualifiedSWHID( object_type=ObjectType.CONTENT, object_id=object_id, origin=dir_swhid.origin, visit=dir_swhid.visit, anchor=dir_swhid.anchor, path=(path or "/") + f["name"], lines=(focus_swhid.lines if object_id == focus_swhid.object_id else None), ) f["url"] = reverse( "swhid-iframe", url_args={"swhid": str(cnt_swhid)}, query_params={"focus_swhid": str(focus_swhid)}, ) return {"dirs": dirs, "files": files} def _get_breacrumbs_data( swhid: QualifiedSWHID, focus_swhid: QualifiedSWHID, path: str, snapshot_context: Optional[SnapshotContext] = None, ) -> Tuple[List[Dict[str, Any]], Optional[str]]: breadcrumbs = [] filename = None # strip any leading or trailing slash from path qualifier of SWHID if path and path[0] == "/": path = path[1:] if path and path[-1] == "/": path = path[:-1] if swhid.object_type == ObjectType.CONTENT: split_path = path.split("/") filename = split_path[-1] path = path[: -len(filename)] path_info = gen_path_info(path) if path != "/" else [] root_dir = None if snapshot_context and snapshot_context["root_directory"]: root_dir = snapshot_context["root_directory"] elif swhid.anchor and swhid.anchor.object_type == ObjectType.DIRECTORY: root_dir = swhid.anchor.object_id.hex() elif focus_swhid.object_type == ObjectType.DIRECTORY: root_dir = focus_swhid.object_id.hex() if root_dir: root_dir_swhid = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(root_dir), origin=swhid.origin, visit=swhid.visit, anchor=swhid.anchor, ) breadcrumbs.append( { "name": root_dir[:7], "object_id": root_dir_swhid.object_id.hex(), "path": "/", "url": reverse( "swhid-iframe", url_args={"swhid": str(root_dir_swhid)}, query_params={ - "focus_swhid": focus_swhid + "focus_swhid": str(focus_swhid) if focus_swhid != root_dir_swhid else None }, ), } ) for pi in path_info: dir_info = archive.lookup_directory_with_path(root_dir, pi["path"]) dir_swhid = QualifiedSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(dir_info["target"]), origin=swhid.origin, visit=swhid.visit, anchor=swhid.anchor, path="/" + pi["path"] + "/", ) breadcrumbs.append( { "name": pi["name"], "object_id": dir_swhid.object_id.hex(), "path": dir_swhid.path.decode("utf-8") if dir_swhid.path else "", "url": reverse( "swhid-iframe", url_args={"swhid": str(dir_swhid)}, - query_params={"focus_swhid": focus_swhid}, + query_params={"focus_swhid": str(focus_swhid)}, ), } ) if filename: breadcrumbs.append( { "name": filename, "object_id": swhid.object_id.hex(), "path": path, "url": "", } ) return breadcrumbs, root_dir @xframe_options_exempt def swhid_iframe(request, swhid: str): """Django view that can be embedded in an iframe to display objects archived by Software Heritage (currently contents and directories) in a minimalist Web UI. """ focus_swhid = request.GET.get("focus_swhid", swhid) parsed_swhid = None view_data = {} breadcrumbs: List[Dict[str, Any]] = [] swh_objects = [] snapshot_context = None swhids_info_extra_context = {} archive_link = None try: parsed_swhid = get_swhid(swhid) parsed_focus_swhid = get_swhid(focus_swhid) path = parsed_swhid.path.decode("utf-8") if parsed_swhid.path else "" snapshot_context = None revision_id = None if ( parsed_swhid.anchor and parsed_swhid.anchor.object_type == ObjectType.REVISION ): revision_id = parsed_swhid.anchor.object_id.hex() if parsed_swhid.origin or parsed_swhid.visit: snapshot_context = get_snapshot_context( origin_url=parsed_swhid.origin, snapshot_id=parsed_swhid.visit.object_id.hex() if parsed_swhid.visit else None, revision_id=revision_id, ) error_info: Dict[str, Any] = {"status_code": 200, "description": ""} if parsed_swhid and parsed_swhid.object_type == ObjectType.CONTENT: view_data = _get_content_rendering_data(parsed_swhid, path) swh_objects.append( SWHObjectInfo( object_type=ObjectType.CONTENT, object_id=parsed_swhid.object_id.hex(), ) ) elif parsed_swhid and parsed_swhid.object_type == ObjectType.DIRECTORY: view_data = _get_directory_rendering_data( parsed_swhid, parsed_focus_swhid, path ) swh_objects.append( SWHObjectInfo( object_type=ObjectType.DIRECTORY, object_id=parsed_swhid.object_id.hex(), ) ) elif parsed_swhid: error_info = { "status_code": 400, "description": ( f"Objects of type {parsed_swhid.object_type} are not supported" ), } swhids_info_extra_context["path"] = path if parsed_swhid and view_data: breadcrumbs, root_dir = _get_breacrumbs_data( parsed_swhid, parsed_focus_swhid, path, snapshot_context ) if parsed_swhid.object_type == ObjectType.CONTENT and len(breadcrumbs) > 1: swh_objects.append( SWHObjectInfo( object_type=ObjectType.DIRECTORY, object_id=breadcrumbs[-2]["object_id"], ) ) swhids_info_extra_context["path"] = breadcrumbs[-2]["path"] swhids_info_extra_context["filename"] = breadcrumbs[-1]["name"] if snapshot_context: swh_objects.append( SWHObjectInfo( object_type=ObjectType.REVISION, object_id=snapshot_context["revision_id"] or "", ) ) swh_objects.append( SWHObjectInfo( object_type=ObjectType.SNAPSHOT, object_id=snapshot_context["snapshot_id"] or "", ) ) archive_link = reverse("browse-swhid", url_args={"swhid": swhid}) if ( parsed_swhid.origin is None and parsed_swhid.visit is None and parsed_swhid.anchor is None and root_dir is not None ): # qualifier values cannot be used to get root directory from them, # we need to add it as anchor in the SWHID argument of the archive link root_dir_swhid = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(root_dir) ) archive_swhid = QualifiedSWHID( object_type=parsed_swhid.object_type, object_id=parsed_swhid.object_id, path=parsed_swhid.path, anchor=root_dir_swhid, ) archive_link = reverse( "browse-swhid", url_args={"swhid": f"{archive_swhid}"}, ) except BadInputExc as e: error_info = {"status_code": 400, "description": f"BadInputExc: {str(e)}"} except NotFoundExc as e: error_info = {"status_code": 404, "description": f"NotFoundExc: {str(e)}"} except Exception as e: error_info = {"status_code": 500, "description": str(e)} return render( request, "misc/iframe.html", { **view_data, "iframe_mode": True, "object_type": parsed_swhid.object_type.value if parsed_swhid else None, "lines": parsed_swhid.lines if parsed_swhid else None, "breadcrumbs": breadcrumbs, "swhid": swhid, "focus_swhid": focus_swhid, "archive_link": archive_link, "error_code": error_info["status_code"], "error_message": http_status_code_message.get(error_info["status_code"]), "error_description": error_info["description"], "snapshot_context": None, "swhids_info": get_swhids_info( swh_objects, snapshot_context, swhids_info_extra_context ), }, status=error_info["status_code"], ) urlpatterns = [ url( r"^embed/(?Pswh:[0-9]+:[a-z]+:[0-9a-f]+.*)/$", swhid_iframe, name="swhid-iframe", ), ]