diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py index dac07c63..ff6d72ca 100644 --- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -1,758 +1,744 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import magic import stat import textwrap from threading import Lock from django.core.cache import cache from django.utils.safestring import mark_safe from django.utils.html import escape import sentry_sdk from swh.web.common import highlightjs, service from swh.web.common.exc import http_status_code_message from swh.web.common.utils import ( reverse, format_utc_iso_date, rst_to_html, + browsers_supported_image_mimes, ) from swh.web.config import get_config def get_directory_entries(sha1_git): """Function that retrieves the content of a directory from the archive. The directories entries are first sorted in lexicographical order. Sub-directories and regular files are then extracted. 
# Serializes calls to the file-magic API, which is not thread-safe.
_lock = Lock()


def get_mimetype_and_encoding_for_content(content):
    """Function that returns the mime type and the encoding associated to
    a content buffer using the magic module under the hood.

    Two incompatible packages are distributed under the ``magic`` module
    name; the one that is installed is detected through the presence of the
    ``from_buffer`` attribute.

    Args:
        content (bytes): a content buffer

    Returns:
        A tuple (mimetype, encoding), for instance ('text/plain', 'us-ascii'),
        associated to the provided content.

    """
    # https://pypi.org/project/python-magic/
    # packaged as python3-magic in debian buster
    if hasattr(magic, "from_buffer"):
        m = magic.Magic(mime=True, mime_encoding=True)
        # result has the form "<mime type>; charset=<encoding>"
        mime_encoding = m.from_buffer(content)
        mime_type, encoding = mime_encoding.split(";")
        encoding = encoding.replace(" charset=", "")
    # https://pypi.org/project/file-magic/
    # packaged as python3-magic in debian stretch
    else:
        # TODO: Remove that code when production environment is upgraded
        # to debian buster

        # calls to the file-magic API are not thread-safe so they must
        # be protected with a Lock to guarantee they will succeed.
        # The context manager releases the lock even when detection raises,
        # otherwise every later call would block forever on acquire().
        with _lock:
            magic_result = magic.detect_from_content(content)
        mime_type = magic_result.mime_type
        encoding = magic_result.encoding
    return mime_type, encoding
that retrieves a content from the archive. Raw bytes content is first retrieved, then the content mime type. If the mime type is not stored in the archive, it will be computed using Python magic module. Args: query_string: a string of the form "[ALGO_HASH:]HASH" where optional ALGO_HASH can be either ``sha1``, ``sha1_git``, ``sha256``, or ``blake2s256`` (default to ``sha1``) and HASH the hexadecimal representation of the hash value max_size: the maximum size for a content to retrieve (default to 1MB, no size limit if None) Returns: A tuple whose first member corresponds to the content raw bytes and second member the content mime type Raises: NotFoundExc if the content is not found """ content_data = service.lookup_content(query_string) filetype = None language = None license = None # requests to the indexer db may fail so properly handle # those cases in order to avoid content display errors try: filetype = service.lookup_content_filetype(query_string) language = service.lookup_content_language(query_string) license = service.lookup_content_license(query_string) except Exception as exc: sentry_sdk.capture_exception(exc) mimetype = "unknown" encoding = "unknown" if filetype: mimetype = filetype["mimetype"] encoding = filetype["encoding"] # workaround when encountering corrupted data due to implicit # conversion from bytea to text in the indexer db (see T818) # TODO: Remove that code when all data have been correctly converted if mimetype.startswith("\\"): filetype = None content_data["error_code"] = 200 content_data["error_message"] = "" content_data["error_description"] = "" if not max_size or content_data["length"] < max_size: try: content_raw = service.lookup_content_raw(query_string) except Exception as exc: if raise_if_unavailable: raise exc else: sentry_sdk.capture_exception(exc) content_data["raw_data"] = None content_data["error_code"] = 404 content_data["error_description"] = ( "The bytes of the content are currently not available " "in the archive." 
) content_data["error_message"] = http_status_code_message[ content_data["error_code"] ] else: content_data["raw_data"] = content_raw["data"] if not filetype: mimetype, encoding = get_mimetype_and_encoding_for_content( content_data["raw_data"] ) if re_encode: mimetype, encoding, raw_data = _re_encode_content( mimetype, encoding, content_data["raw_data"] ) content_data["raw_data"] = raw_data else: content_data["raw_data"] = None content_data["mimetype"] = mimetype content_data["encoding"] = encoding if language: content_data["language"] = language["lang"] else: content_data["language"] = "not detected" if license: content_data["licenses"] = ", ".join(license["facts"][0]["licenses"]) else: content_data["licenses"] = "not detected" return content_data -_browsers_supported_image_mimes = set( - [ - "image/gif", - "image/png", - "image/jpeg", - "image/bmp", - "image/webp", - "image/svg", - "image/svg+xml", - ] -) - - def prepare_content_for_display(content_data, mime_type, path): """Function that prepares a content for HTML display. The function tries to associate a programming language to a content in order to perform syntax highlighting client-side using highlightjs. The language is determined using either the content filename or its mime type. If the mime type corresponds to an image format supported by web browsers, the content will be encoded in base64 for displaying the image. Args: content_data (bytes): raw bytes of the content mime_type (string): mime type of the content path (string): path of the content including filename Returns: A dict containing the content bytes (possibly different from the one provided as parameter if it is an image) under the key 'content_data and the corresponding highlightjs language class under the key 'language'. 
""" language = highlightjs.get_hljs_language_from_filename(path) if not language: language = highlightjs.get_hljs_language_from_mime_type(mime_type) if not language: language = "nohighlight" elif mime_type.startswith("application/"): mime_type = mime_type.replace("application/", "text/") if mime_type.startswith("image/"): - if mime_type in _browsers_supported_image_mimes: + if mime_type in browsers_supported_image_mimes: content_data = base64.b64encode(content_data).decode("ascii") - else: - content_data = None if mime_type.startswith("image/svg"): mime_type = "image/svg+xml" if mime_type.startswith("text/"): content_data = content_data.decode("utf-8", errors="replace") return {"content_data": content_data, "language": language, "mimetype": mime_type} def gen_link(url, link_text=None, link_attrs=None): """ Utility function for generating an HTML link to insert in Django templates. Args: url (str): an url link_text (str): optional text for the produced link, if not provided the url will be used link_attrs (dict): optional attributes (e.g. 
class) to add to the link Returns: An HTML link in the form 'link_text' """ attrs = " " if link_attrs: for k, v in link_attrs.items(): attrs += '%s="%s" ' % (k, v) if not link_text: link_text = url link = '%s' % (attrs, escape(url), escape(link_text)) return mark_safe(link) def _snapshot_context_query_params(snapshot_context): query_params = None if snapshot_context and snapshot_context["origin_info"]: origin_info = snapshot_context["origin_info"] query_params = {"origin_url": origin_info["url"]} if "timestamp" in snapshot_context["query_params"]: query_params["timestamp"] = snapshot_context["query_params"]["timestamp"] if "visit_id" in snapshot_context["query_params"]: query_params["visit_id"] = snapshot_context["query_params"]["visit_id"] elif snapshot_context: query_params = {"snapshot_id": snapshot_context["snapshot_id"]} return query_params def gen_revision_url(revision_id, snapshot_context=None): """ Utility function for generating an url to a revision. Args: revision_id (str): a revision id snapshot_context (dict): if provided, generate snapshot-dependent browsing url Returns: str: The url to browse the revision """ query_params = _snapshot_context_query_params(snapshot_context) return reverse( "browse-revision", url_args={"sha1_git": revision_id}, query_params=query_params ) def gen_revision_link( revision_id, shorten_id=False, snapshot_context=None, link_text="Browse", link_attrs={"class": "btn btn-default btn-sm", "role": "button"}, ): """ Utility function for generating a link to a revision HTML view to insert in Django templates. Args: revision_id (str): a revision id shorten_id (boolean): whether to shorten the revision id to 7 characters for the link text snapshot_context (dict): if provided, generate snapshot-dependent browsing link link_text (str): optional text for the generated link (the revision id will be used by default) link_attrs (dict): optional attributes (e.g. 
def gen_snapshot_link(
    snapshot_id,
    snapshot_context=None,
    link_text="Browse",
    link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
):
    """
    Utility function for generating a link to a snapshot HTML view
    to insert in Django templates.

    Args:
        snapshot_id (str): snapshot identifier
        snapshot_context (dict): if provided, generate snapshot-dependent
            browsing link
        link_text (str): optional text for the generated link; when it is
            empty or None, the snapshot id is used instead
        link_attrs (dict): optional attributes (e.g. class)
            to add to the link

    Returns:
        An HTML link to the snapshot view, or None when no snapshot
        identifier is provided

    """
    # Same guard as gen_directory_link / gen_content_link / gen_revision_link:
    # without an identifier there is no URL to reverse.
    if not snapshot_id:
        return None

    # NOTE: the default link_attrs dict is shared between calls; it is only
    # forwarded to gen_link here and must not be mutated.
    query_params = _snapshot_context_query_params(snapshot_context)
    snapshot_url = reverse(
        "browse-snapshot",
        url_args={"snapshot_id": snapshot_id},
        query_params=query_params,
    )
    if not link_text:
        link_text = snapshot_id
    return gen_link(snapshot_url, link_text, link_attrs)
Args: revision_id (str): revision identifier the history heads to snapshot_context (dict): if provided, generate snapshot-dependent browsing link Returns: The revision log view URL """ query_params = {"revision": revision_id} if snapshot_context and snapshot_context["origin_info"]: origin_info = snapshot_context["origin_info"] query_params["origin_url"] = origin_info["url"] if "timestamp" in snapshot_context["query_params"]: query_params["timestamp"] = snapshot_context["query_params"]["timestamp"] if "visit_id" in snapshot_context["query_params"]: query_params["visit_id"] = snapshot_context["query_params"]["visit_id"] revision_log_url = reverse("browse-origin-log", query_params=query_params) elif snapshot_context: url_args = {"snapshot_id": snapshot_context["snapshot_id"]} revision_log_url = reverse( "browse-snapshot-log", url_args=url_args, query_params=query_params ) else: revision_log_url = reverse( "browse-revision-log", url_args={"sha1_git": revision_id} ) return revision_log_url def gen_revision_log_link( revision_id, snapshot_context=None, link_text="Browse", link_attrs={"class": "btn btn-default btn-sm", "role": "button"}, ): """ Utility function for generating a link to a revision log HTML view (possibly in the context of an origin) to insert in Django templates. Args: revision_id (str): revision identifier the history heads to snapshot_context (dict): if provided, generate snapshot-dependent browsing link link_text (str): optional text to use for the generated link (the revision id will be used by default) link_attrs (dict): optional attributes (e.g. 
def gen_release_link(
    sha1_git,
    snapshot_context=None,
    link_text="Browse",
    link_attrs={"class": "btn btn-default btn-sm", "role": "button"},
):
    """
    Utility function for generating a link to a release HTML view
    to insert in Django templates.

    Args:
        sha1_git (str): release identifier
        snapshot_context (dict): if provided, generate snapshot-dependent
            browsing link
        link_text (str): optional text for the generated link; when it is
            empty or None, the release id is used instead
        link_attrs (dict): optional attributes (e.g. class)
            to add to the link

    Returns:
        An HTML link to the release view, or None when no release
        identifier is provided

    """
    # Same guard as gen_directory_link / gen_content_link / gen_revision_link:
    # without an identifier there is no URL to reverse.
    if not sha1_git:
        return None

    # NOTE: the default link_attrs dict is shared between calls; it is only
    # forwarded to gen_link here and must not be mutated.
    query_params = _snapshot_context_query_params(snapshot_context)
    release_url = reverse(
        "browse-release", url_args={"sha1_git": sha1_git}, query_params=query_params
    )
    if not link_text:
        link_text = sha1_git
    return gen_link(release_url, link_text, link_attrs)
Its purpose is to: * add links to relevant browse views * format date in human readable format * truncate the message log Args: revision_log (list): raw revision log as returned by the swh-web api per_page (int): number of log entries per page snapshot_context (dict): if provided, generate snapshot-dependent browsing link """ revision_log_data = [] for i, rev in enumerate(revision_log): if i == per_page: break author_name = "None" author_fullname = "None" committer_fullname = "None" if rev["author"]: author_name = gen_person_mail_link(rev["author"]) author_fullname = rev["author"]["fullname"] if rev["committer"]: committer_fullname = rev["committer"]["fullname"] author_date = format_utc_iso_date(rev["date"]) committer_date = format_utc_iso_date(rev["committer_date"]) tooltip = "revision %s\n" % rev["id"] tooltip += "author: %s\n" % author_fullname tooltip += "author date: %s\n" % author_date tooltip += "committer: %s\n" % committer_fullname tooltip += "committer date: %s\n\n" % committer_date if rev["message"]: tooltip += textwrap.indent(rev["message"], " " * 4) revision_log_data.append( { "author": author_name, "id": rev["id"][:7], "message": rev["message"], "date": author_date, "commit_date": committer_date, "url": gen_revision_url(rev["id"], snapshot_context), "tooltip": tooltip, } ) return revision_log_data # list of common readme names ordered by preference # (lower indices have higher priority) _common_readme_names = [ "readme.markdown", "readme.md", "readme.rst", "readme.txt", "readme", ] def get_readme_to_display(readmes): """ Process a list of readme files found in a directory in order to find the adequate one to display. 
Args: readmes: a list of dict where keys are readme file names and values are readme sha1s Returns: A tuple (readme_name, readme_sha1) """ readme_name = None readme_url = None readme_sha1 = None readme_html = None lc_readmes = {k.lower(): {"orig_name": k, "sha1": v} for k, v in readmes.items()} # look for readme names according to the preference order # defined by the _common_readme_names list for common_readme_name in _common_readme_names: if common_readme_name in lc_readmes: readme_name = lc_readmes[common_readme_name]["orig_name"] readme_sha1 = lc_readmes[common_readme_name]["sha1"] readme_url = reverse( "browse-content-raw", url_args={"query_string": readme_sha1}, query_params={"re_encode": "true"}, ) break # otherwise pick the first readme like file if any if not readme_name and len(readmes.items()) > 0: readme_name = next(iter(readmes)) readme_sha1 = readmes[readme_name] readme_url = reverse( "browse-content-raw", url_args={"query_string": readme_sha1}, query_params={"re_encode": "true"}, ) # convert rst README to html server side as there is # no viable solution to perform that task client side if readme_name and readme_name.endswith(".rst"): cache_entry_id = "readme_%s" % readme_sha1 cache_entry = cache.get(cache_entry_id) if cache_entry: readme_html = cache_entry else: try: rst_doc = request_content(readme_sha1) readme_html = rst_to_html(rst_doc["raw_data"]) cache.set(cache_entry_id, readme_html) except Exception as exc: sentry_sdk.capture_exception(exc) readme_html = "Readme bytes are not available" return readme_name, readme_url, readme_html diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py index 0d119c7e..65fe3b8a 100644 --- a/swh/web/common/utils.py +++ b/swh/web/common/utils.py @@ -1,350 +1,364 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more 
information import re from datetime import datetime, timezone from dateutil import parser as date_parser from dateutil import tz from typing import Optional, Dict, Any import docutils.parsers.rst import docutils.utils from bs4 import BeautifulSoup from docutils.core import publish_parts from docutils.writers.html5_polyglot import Writer, HTMLTranslator from django.urls import reverse as django_reverse from django.http import QueryDict, HttpRequest from prometheus_client.registry import CollectorRegistry from rest_framework.authentication import SessionAuthentication from swh.web.common.exc import BadInputExc from swh.web.common.typing import QueryParameters from swh.web.config import get_config SWH_WEB_METRICS_REGISTRY = CollectorRegistry(auto_describe=True) swh_object_icons = { "branch": "fa fa-code-fork", "branches": "fa fa-code-fork", "content": "fa fa-file-text", "directory": "fa fa-folder", "person": "fa fa-user", "revisions history": "fa fa-history", "release": "fa fa-tag", "releases": "fa fa-tag", "revision": "octicon-git-commit", "snapshot": "fa fa-camera", "visits": "fa fa-calendar", } def reverse( viewname: str, url_args: Optional[Dict[str, Any]] = None, query_params: Optional[QueryParameters] = None, current_app: Optional[str] = None, urlconf: Optional[str] = None, request: Optional[HttpRequest] = None, ) -> str: """An override of django reverse function supporting query parameters. 
def datetime_to_utc(date):
    """Convert a timezone-aware datetime to UTC.

    Args:
        date (datetime.datetime): input datetime, with or without
            timezone info

    Returns:
        datetime.datetime: when the input carries timezone info, the same
        instant expressed in UTC with ``tzinfo`` set to ``timezone.utc``;
        an input without timezone info is returned unchanged
    """
    if date.tzinfo is None:
        # nothing to convert from: keep naive datetimes as they are
        return date
    # astimezone() both shifts the wall clock fields to UTC and attaches
    # timezone.utc, no extra replace(tzinfo=...) step is needed
    return date.astimezone(timezone.utc)
def shorten_path(path):
    """Shorten the given path: for each hash present, only return the first
    8 characters followed by an ellipsis.

    Args:
        path (str): a path possibly containing hexadecimal sha256
            (64 characters) or sha1 (40 characters) hashes

    Returns:
        str: the path with every such hash replaced by its first
        8 characters followed by ``...``
    """
    # both patterns only accept hexadecimal digits: a run of arbitrary
    # alphanumeric characters is not a hash and must be left untouched
    sha256_re = r"([0-9a-f]{8})[0-9a-f]{56}"
    sha1_re = r"([0-9a-f]{8})[0-9a-f]{32}"

    # sha256 must be processed first, otherwise the sha1 pattern would
    # consume the first 40 characters of a sha256 hash
    ret = re.sub(sha256_re, r"\1...", path)
    return re.sub(sha1_re, r"\1...", ret)
""" path_info = [] if path: sub_paths = path.strip("/").split("/") path_from_root = "" for p in sub_paths: path_from_root += "/" + p path_info.append({"name": p, "path": path_from_root.strip("/")}) return path_info def parse_rst(text, report_level=2): """ Parse a reStructuredText string with docutils. Args: text (str): string with reStructuredText markups in it report_level (int): level of docutils report messages to print (1 info 2 warning 3 error 4 severe 5 none) Returns: docutils.nodes.document: a parsed docutils document """ parser = docutils.parsers.rst.Parser() components = (docutils.parsers.rst.Parser,) settings = docutils.frontend.OptionParser( components=components ).get_default_values() settings.report_level = report_level document = docutils.utils.new_document("rst-doc", settings=settings) parser.parse(text, document) return document def get_client_ip(request): """ Return the client IP address from an incoming HTTP request. Args: request (django.http.HttpRequest): the incoming HTTP request Returns: str: The client IP address """ x_forwarded_for = request.META.get("HTTP_X_FORWARDED_FOR") if x_forwarded_for: ip = x_forwarded_for.split(",")[0] else: ip = request.META.get("REMOTE_ADDR") return ip +browsers_supported_image_mimes = set( + [ + "image/gif", + "image/png", + "image/jpeg", + "image/bmp", + "image/webp", + "image/svg", + "image/svg+xml", + ] +) + + def context_processor(request): """ Django context processor used to inject variables in all swh-web templates. """ config = get_config() if ( hasattr(request, "user") and request.user.is_authenticated and not hasattr(request.user, "backend") ): # To avoid django.template.base.VariableDoesNotExist errors # when rendering templates when standard Django user is logged in. 
def resolve_branch_alias(
    snapshot: Dict[str, Any], branch: Optional[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """
    Resolve branch alias in snapshot content.

    Aliases are followed until a non-alias branch is reached. Targets that
    are missing from the provided snapshot content are looked up in the
    archive.

    Args:
        snapshot: a full snapshot content
        branch: a branch alias contained in the snapshot

    Returns:
        The real snapshot branch that got aliased, or None when the alias
        cannot be resolved (unknown target or aliases referencing each
        other in a cycle).
    """
    # targets already followed, used to detect alias cycles
    # (e.g. a -> b -> a) that would otherwise loop forever
    visited_targets = set()
    while branch and branch["target_type"] == "alias":
        target = branch["target"]
        if target in visited_targets:
            return None
        visited_targets.add(target)

        if target in snapshot["branches"]:
            branch = snapshot["branches"][target]
        else:
            # the snapshot content may be partial: fetch the single
            # branch the alias points to from the archive.
            # Local import, as in the original code.
            from swh.web.common import service

            snp = service.lookup_snapshot(
                snapshot["id"], branches_from=target, branches_count=1
            )
            if snp and target in snp["branches"]:
                branch = snp["branches"][target]
            else:
                branch = None
    return branch
{pp["html_body"]}
' def prettify_html(html: str) -> str: """ Prettify an HTML document. Args: html: Input HTML document Returns: The prettified HTML document """ return BeautifulSoup(html, "lxml").prettify() diff --git a/swh/web/templates/includes/content-display.html b/swh/web/templates/includes/content-display.html index 40eed9f2..ed815f75 100644 --- a/swh/web/templates/includes/content-display.html +++ b/swh/web/templates/includes/content-display.html @@ -1,78 +1,77 @@ {% comment %} Copyright (C) 2017-2019 The Software Heritage developers See the AUTHORS file at the top-level directory of this distribution License: GNU Affero General Public License version 3, or any later version See top-level LICENSE file for more information {% endcomment %} {% load swh_templatetags %} {% if snapshot_context and snapshot_context.is_empty %} {% include "includes/empty-snapshot.html" %} {% else %}
{% if filename %}
{{ filename }}
{% endif %}
{% if content_size > max_content_size %} Content is too large to be displayed (size is greater than {{ max_content_size|filesizeformat }}). {% elif "inode/x-empty" == mimetype %} File is empty {% elif filename and filename|default:""|slice:"-5:" == "ipynb" %}
{% elif "text/" in mimetype and encoding != "binary" %}
{{ content }}
- {% elif "image/" in mimetype and content %} + {% elif mimetype in browsers_supported_image_mimes and content %} {% elif "application/pdf" == mimetype %}
Page: /
{% elif content %} - Content with mime type {{ mimetype }} and encoding - {{ encoding }} cannot be displayed. + Content with mime type {{ mimetype }} and encoding {{ encoding }} cannot be displayed. {% else %} {% include "includes/http-error.html" %} {% endif %}
{% endif %} diff --git a/swh/web/tests/browse/views/test_content.py b/swh/web/tests/browse/views/test_content.py index 27f237ec..b9290589 100644 --- a/swh/web/tests/browse/views/test_content.py +++ b/swh/web/tests/browse/views/test_content.py @@ -1,404 +1,425 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import textwrap from django.utils.html import escape from hypothesis import given from swh.web.browse.utils import ( get_mimetype_and_encoding_for_content, prepare_content_for_display, _re_encode_content, ) from swh.web.common.exc import NotFoundExc from swh.web.common.identifiers import get_swh_persistent_id from swh.web.common.utils import gen_path_info, reverse from swh.web.tests.django_asserts import ( assert_contains, assert_not_contains, assert_template_used, ) from swh.web.tests.strategies import ( content, content_text_non_utf8, content_text_no_highlight, content_image_type, + content_unsupported_image_type_rendering, content_text, invalid_sha1, unknown_content, content_utf8_detected_as_binary, ) @given(content_text()) def test_content_view_text(client, archive_data, content): sha1_git = content["sha1_git"] url = reverse( "browse-content", url_args={"query_string": content["sha1"]}, query_params={"path": content["path"]}, ) url_raw = reverse("browse-content-raw", url_args={"query_string": content["sha1"]}) resp = client.get(url) content_display = _process_content_for_display(archive_data, content) mimetype = content_display["mimetype"] assert resp.status_code == 200 assert_template_used(resp, "browse/content.html") if mimetype.startswith("text/"): assert_contains(resp, '' % content_display["language"]) assert_contains(resp, escape(content_display["content_data"])) assert_contains(resp, url_raw) swh_cnt_id = get_swh_persistent_id("content", 
sha1_git) swh_cnt_id_url = reverse("browse-swh-id", url_args={"swh_id": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) assert_contains( resp, textwrap.indent( ( f"Browse archived content\n" f'\n' f" {swh_cnt_id}\n" f"" ), " " * 4, ), ) @given(content_text_no_highlight()) def test_content_view_text_no_highlight(client, archive_data, content): sha1_git = content["sha1_git"] url = reverse("browse-content", url_args={"query_string": content["sha1"]}) url_raw = reverse("browse-content-raw", url_args={"query_string": content["sha1"]}) resp = client.get(url) content_display = _process_content_for_display(archive_data, content) assert resp.status_code == 200 assert_template_used(resp, "browse/content.html") assert_contains(resp, '') assert_contains(resp, escape(content_display["content_data"])) assert_contains(resp, url_raw) swh_cnt_id = get_swh_persistent_id("content", sha1_git) swh_cnt_id_url = reverse("browse-swh-id", url_args={"swh_id": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) @given(content_text_non_utf8()) def test_content_view_no_utf8_text(client, archive_data, content): sha1_git = content["sha1_git"] url = reverse("browse-content", url_args={"query_string": content["sha1"]}) resp = client.get(url) content_display = _process_content_for_display(archive_data, content) assert resp.status_code == 200 assert_template_used(resp, "browse/content.html") swh_cnt_id = get_swh_persistent_id("content", sha1_git) swh_cnt_id_url = reverse("browse-swh-id", url_args={"swh_id": swh_cnt_id}) assert_contains(resp, swh_cnt_id_url) assert_contains(resp, escape(content_display["content_data"])) @given(content_image_type()) def test_content_view_image(client, archive_data, content): url = reverse("browse-content", url_args={"query_string": content["sha1"]}) url_raw = reverse("browse-content-raw", url_args={"query_string": content["sha1"]}) resp = client.get(url) content_display = 
_process_content_for_display(archive_data, content) mimetype = content_display["mimetype"] content_data = content_display["content_data"] assert resp.status_code == 200 assert_template_used(resp, "browse/content.html") assert_contains(resp, '' % (mimetype, content_data)) assert_contains(resp, url_raw) +@given(content_unsupported_image_type_rendering()) +def test_content_view_image_no_rendering(client, archive_data, content): + url = reverse("browse-content", url_args={"query_string": content["sha1"]}) + + resp = client.get(url) + + mimetype = content["mimetype"] + encoding = content["encoding"] + + assert resp.status_code == 200 + assert_template_used(resp, "browse/content.html") + assert_contains( + resp, + ( + f"Content with mime type {mimetype} and encoding {encoding} " + "cannot be displayed." + ), + ) + + @given(content_text()) def test_content_view_text_with_path(client, archive_data, content): path = content["path"] url = reverse( "browse-content", url_args={"query_string": content["sha1"]}, query_params={"path": path}, ) resp = client.get(url) assert resp.status_code == 200 assert_template_used(resp, "browse/content.html") assert_contains(resp, '