diff --git a/swh/web/api/views/content.py b/swh/web/api/views/content.py index 6936b23e..720e5b1a 100644 --- a/swh/web/api/views/content.py +++ b/swh/web/api/views/content.py @@ -1,342 +1,343 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import io +import os from typing import Optional from django.http import FileResponse from rest_framework.request import Request from swh.web.api import utils from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup from swh.web.utils import archive from swh.web.utils.exc import NotFoundExc @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/filetype/", "api-1-content-filetype", checksum_args=["q"], ) @api_doc("/content/filetype/", category="Metadata") @format_docstring() def api_content_filetype(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/filetype/ Get information about the detected MIME type of a content object. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :>json object content_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/` for getting information about the content :>json string encoding: the detected content encoding :>json string id: the **sha1** identifier of the content :>json string mimetype: the detected MIME type of the content :>json object tool: information about the tool used to detect the content filetype {common_headers} :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/filetype/` """ return api_lookup( archive.lookup_content_filetype, q, notfound_msg="No filetype information found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/language/", "api-1-content-language", checksum_args=["q"], ) @api_doc("/content/language/", category="Metadata") @format_docstring() def api_content_language(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/language/ Get information about the programming language used in a content object. Note: this endpoint currently returns no data. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :>json object content_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/` for getting information about the content :>json string id: the **sha1** identifier of the content :>json string lang: the detected programming language if any :>json object tool: information about the tool used to detect the programming language {common_headers} :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/language/` """ return api_lookup( archive.lookup_content_language, q, notfound_msg="No language information found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/license/", "api-1-content-license", checksum_args=["q"], ) @api_doc("/content/license/", category="Metadata") @format_docstring() def api_content_license(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/license/ Get information about the license of a content object. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :>json object content_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/` for getting information about the content :>json string id: the **sha1** identifier of the content :>json array licenses: array of strings containing the detected license names :>json object tool: information about the tool used to detect the license {common_headers} :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/license/` """ return api_lookup( archive.lookup_content_license, q, notfound_msg="No license information found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/raw/", "api-1-content-raw", checksum_args=["q"], ) @api_doc("/content/raw/", category="Archive") def api_content_raw(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/raw/ Get the raw content of a content object (aka a "blob"), as a byte sequence. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :query string filename: if provided, the downloaded content will get that filename :resheader Content-Type: application/octet-stream :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/raw/` """ content_raw = archive.lookup_content_raw(q) if not content_raw: raise NotFoundExc("Content %s is not found." % q) filename = request.query_params.get("filename") if not filename: filename = "content_%s_raw" % q.replace(":", "_") return FileResponse( io.BytesIO(content_raw["data"]), # not copied, as this is never modified - filename=filename, + filename=os.path.basename(filename), content_type="application/octet-stream", as_attachment=True, ) @api_route(r"/content/known/search/", "api-1-content-known", methods=["POST"]) @api_route(r"/content/known/(?P(?!search).+)/", "api-1-content-known") @api_doc("/content/known/", category="Archive", tags=["hidden"]) @format_docstring() def api_check_content_known(request: Request, q: Optional[str] = None): """ .. http:get:: /api/1/content/known/(sha1)[,(sha1), ...,(sha1)]/ Check whether some content(s) (aka "blob(s)") is present in the archive based on its **sha1** checksum. :param string sha1: hexadecimal representation of the **sha1** checksum value for the content to check existence. Multiple values can be provided separated by ','. {common_headers} :>json array search_res: array holding the search result for each provided **sha1** :>json object search_stats: some statistics regarding the number of **sha1** provided and the percentage of those found in the archive :statuscode 200: no error :statuscode 400: an invalid **sha1** has been provided **Example:** .. parsed-literal:: :swh_web_api:`content/known/dc2830a9e72f23c1dfebef4413003221baa5fb62,0c3f19cb47ebfbe643fb19fa94c874d18fa62d12/` """ search_stats = {"nbfiles": 0, "pct": 0} search_res = None queries = [] # GET: Many hash separated values request if q: hashes = q.split(",") for v in hashes: queries.append({"filename": None, "sha1": v}) # POST: Many hash requests in post form submission elif request.method == "POST": data = request.data # Remove potential inputs with no associated value for k, v in data.items(): if v is not None: if k == "q" and len(v) > 0: queries.append({"filename": None, "sha1": v}) elif v != "": queries.append({"filename": k, "sha1": v}) if queries: lookup = archive.lookup_multiple_hashes(queries) result = [] nb_queries = len(queries) for el in lookup: res_d = {"sha1": el["sha1"], "found": el["found"]} if "filename" in el and el["filename"]: res_d["filename"] = el["filename"] result.append(res_d) search_res = result nbfound = len([x for x in lookup if x["found"]]) search_stats["nbfiles"] = nb_queries search_stats["pct"] = int((nbfound / nb_queries) * 100) return {"search_res": search_res, "search_stats": search_stats} @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/", "api-1-content", checksum_args=["q"] ) @api_doc("/content/", category="Archive") @format_docstring() def api_content_metadata(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/ Get information about a content (aka a "blob") object. In the archive, a content object is identified based on checksum values computed using various hashing algorithms. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. {common_headers} :>json object checksums: object holding the computed checksum values for the requested content :>json string data_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/raw/` for downloading the content raw bytes :>json string filetype_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/filetype/` for getting information about the content MIME type :>json string language_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/language/` for getting information about the programming language used in the content :>json number length: length of the content in bytes :>json string license_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/license/` for getting information about the license of the content :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1_git:fe95a46679d128ff167b7c55df5d02356c5a1ae1/` """ return api_lookup( archive.lookup_content, q, notfound_msg="Content with {} not found.".format(q), enrich_fn=functools.partial(utils.enrich_content, query_string=q), request=request, ) diff --git a/swh/web/browse/tests/views/test_content.py b/swh/web/browse/tests/views/test_content.py index 8b07c07c..e89a941a 100644 --- a/swh/web/browse/tests/views/test_content.py +++ b/swh/web/browse/tests/views/test_content.py @@ -1,1115 +1,1116 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import random import re import pytest from django.utils.html import escape from swh.model.hashutil import hash_to_bytes from swh.model.model import ObjectType as ModelObjectType from swh.model.model import Release, Snapshot, SnapshotBranch, TargetType from swh.model.swhids import ObjectType from swh.web.browse.snapshot_context import process_snapshot_branches from swh.web.browse.utils import ( get_mimetype_and_encoding_for_content, prepare_content_for_display, re_encode_content, ) from swh.web.tests.data import get_content from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.helpers import check_html_get_response, check_http_get_response from swh.web.utils import ( format_utc_iso_date, gen_path_info, parse_iso8601_date_to_utc, reverse, ) from swh.web.utils.exc import NotFoundExc from swh.web.utils.identifiers import gen_swhid def test_content_view_text(client, archive_data, content_text): sha1_git = content_text["sha1_git"] url = reverse( "browse-content", url_args={"query_string": content_text["sha1"]}, query_params={"path": content_text["path"]}, ) url_raw = reverse( "browse-content-raw", url_args={"query_string": content_text["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse-content.html" ) content_display = _process_content_for_display(archive_data, content_text) mimetype = content_display["mimetype"] if mimetype.startswith("text/"): assert_contains(resp, '' % content_display["language"]) assert_contains(resp, escape(content_display["content_data"])) assert_contains(resp, url_raw) swh_cnt_id = gen_swhid(ObjectType.CONTENT, sha1_git) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) assert_not_contains(resp, "swh-metadata-popover") def test_content_view_no_highlight( client, archive_data, content_application_no_highlight, content_text_no_highlight ): for content_ in (content_application_no_highlight, content_text_no_highlight): content = content_ sha1_git = content["sha1_git"] url = reverse("browse-content", url_args={"query_string": content["sha1"]}) url_raw = reverse( "browse-content-raw", url_args={"query_string": content["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse-content.html" ) content_display = _process_content_for_display(archive_data, content) if content["encoding"] != "binary": assert_contains(resp, '') assert_contains(resp, escape(content_display["content_data"])) assert_contains(resp, url_raw) swh_cnt_id = gen_swhid(ObjectType.CONTENT, sha1_git) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id) assert_contains(resp, swh_cnt_id_url) def test_content_view_no_utf8_text(client, archive_data, content_text_non_utf8): sha1_git = content_text_non_utf8["sha1_git"] url = reverse( "browse-content", url_args={"query_string": content_text_non_utf8["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse-content.html" ) content_display = _process_content_for_display(archive_data, content_text_non_utf8) swh_cnt_id = gen_swhid(ObjectType.CONTENT, sha1_git) swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id}) assert_contains(resp, swh_cnt_id_url) assert_contains(resp, escape(content_display["content_data"])) def test_content_view_image(client, archive_data, content_image_type): url = reverse( "browse-content", url_args={"query_string": content_image_type["sha1"]} ) url_raw = reverse( "browse-content-raw", url_args={"query_string": content_image_type["sha1"]} ) resp = check_html_get_response( client, url, status_code=200, template_used="browse-content.html" ) content_display = _process_content_for_display(archive_data, content_image_type) mimetype = content_display["mimetype"] content_data = content_display["content_data"] assert_contains(resp, '' % (mimetype, content_data)) assert_contains(resp, url_raw) def test_content_view_image_no_rendering( client, archive_data, content_unsupported_image_type_rendering ): url = reverse( "browse-content", url_args={"query_string": content_unsupported_image_type_rendering["sha1"]}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse-content.html" ) mimetype = content_unsupported_image_type_rendering["mimetype"] encoding = content_unsupported_image_type_rendering["encoding"] assert_contains( resp, ( f"Content with mime type {mimetype} and encoding {encoding} " "cannot be displayed." ), ) def test_content_view_text_with_path(client, archive_data, content_text): path = content_text["path"] url = reverse( "browse-content", url_args={"query_string": content_text["sha1"]}, query_params={"path": path}, ) resp = check_html_get_response( client, url, status_code=200, template_used="browse-content.html" ) assert_contains(resp, '