diff --git a/swh/web/api/views/content.py b/swh/web/api/views/content.py index bcbdddda..2150d123 100644 --- a/swh/web/api/views/content.py +++ b/swh/web/api/views/content.py @@ -1,408 +1,406 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import functools +import io from typing import Optional -from django.http import HttpResponse +from django.http import FileResponse from rest_framework.request import Request from swh.web.api import utils from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup from swh.web.common import archive from swh.web.common.exc import NotFoundExc from swh.web.common.utils import reverse @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/filetype/", "api-1-content-filetype", checksum_args=["q"], ) @api_doc("/content/filetype/") @format_docstring() def api_content_filetype(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/filetype/ Get information about the detected MIME type of a content object. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :>json object content_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/` for getting information about the content :>json string encoding: the detected content encoding :>json string id: the **sha1** identifier of the content :>json string mimetype: the detected MIME type of the content :>json object tool: information about the tool used to detect the content filetype {common_headers} :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/filetype/` """ return api_lookup( archive.lookup_content_filetype, q, notfound_msg="No filetype information found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/language/", "api-1-content-language", checksum_args=["q"], ) @api_doc("/content/language/") @format_docstring() def api_content_language(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/language/ Get information about the programming language used in a content object. Note: this endpoint currently returns no data. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :>json object content_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/` for getting information about the content :>json string id: the **sha1** identifier of the content :>json string lang: the detected programming language if any :>json object tool: information about the tool used to detect the programming language {common_headers} :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/language/` """ return api_lookup( archive.lookup_content_language, q, notfound_msg="No language information found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/license/", "api-1-content-license", checksum_args=["q"], ) @api_doc("/content/license/") @format_docstring() def api_content_license(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/license/ Get information about the license of a content object. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :>json object content_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/` for getting information about the content :>json string id: the **sha1** identifier of the content :>json array licenses: array of strings containing the detected license names :>json object tool: information about the tool used to detect the license {common_headers} :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/license/` """ return api_lookup( archive.lookup_content_license, q, notfound_msg="No license information found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route(r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/ctags/", "api-1-content-ctags") @api_doc("/content/ctags/", tags=["hidden"]) def api_content_ctags(request: Request, q: str): """ Get information about all `Ctags `_-style symbols defined in a content object. """ return api_lookup( archive.lookup_content_ctags, q, notfound_msg="No ctags symbol found for content {}.".format(q), enrich_fn=utils.enrich_metadata_endpoint, request=request, ) @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/raw/", "api-1-content-raw", checksum_args=["q"], ) @api_doc("/content/raw/") def api_content_raw(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/raw/ Get the raw content of a content object (aka a "blob"), as a byte sequence. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. :query string filename: if provided, the downloaded content will get that filename :resheader Content-Type: application/octet-stream :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1:dc2830a9e72f23c1dfebef4413003221baa5fb62/raw/` """ - - def generate(content): - yield content["data"] - content_raw = archive.lookup_content_raw(q) if not content_raw: raise NotFoundExc("Content %s is not found." % q) filename = request.query_params.get("filename") if not filename: filename = "content_%s_raw" % q.replace(":", "_") - response = HttpResponse( - generate(content_raw), content_type="application/octet-stream" + return FileResponse( + io.BytesIO(content_raw["data"]), # not copied, as this is never modified + filename=filename, + content_type="application/octet-stream", + as_attachment=True, ) - response["Content-disposition"] = "attachment; filename=%s" % filename - return response @api_route(r"/content/symbol/(?P.+)/", "api-1-content-symbol") @api_doc("/content/symbol/", tags=["hidden"]) def api_content_symbol(request: Request, q: str): """Search content objects by `Ctags `_-style symbol (e.g., function name, data type, method, ...). """ result = {} last_sha1 = request.query_params.get("last_sha1", None) per_page = int(request.query_params.get("per_page", "10")) def lookup_exp(exp, last_sha1=last_sha1, per_page=per_page): exp = list(archive.lookup_expression(exp, last_sha1, per_page)) return exp if exp else None symbols = api_lookup( lookup_exp, q, notfound_msg="No indexed raw content match expression '{}'.".format(q), enrich_fn=functools.partial(utils.enrich_content, top_url=True), request=request, ) if symbols: nb_symbols = len(symbols) if nb_symbols == per_page: query_params = {} new_last_sha1 = symbols[-1]["sha1"] query_params["last_sha1"] = new_last_sha1 if request.query_params.get("per_page"): query_params["per_page"] = per_page result["headers"] = { "link-next": reverse( "api-1-content-symbol", url_args={"q": q}, query_params=query_params, request=request, ) } result.update({"results": symbols}) return result @api_route(r"/content/known/search/", "api-1-content-known", methods=["POST"]) @api_route(r"/content/known/(?P(?!search).+)/", "api-1-content-known") @api_doc("/content/known/", tags=["hidden"]) @format_docstring() def api_check_content_known(request: Request, q: Optional[str] = None): """ .. http:get:: /api/1/content/known/(sha1)[,(sha1), ...,(sha1)]/ Check whether some content(s) (aka "blob(s)") is present in the archive based on its **sha1** checksum. :param string sha1: hexadecimal representation of the **sha1** checksum value for the content to check existence. Multiple values can be provided separated by ','. {common_headers} :>json array search_res: array holding the search result for each provided **sha1** :>json object search_stats: some statistics regarding the number of **sha1** provided and the percentage of those found in the archive :statuscode 200: no error :statuscode 400: an invalid **sha1** has been provided **Example:** .. parsed-literal:: :swh_web_api:`content/known/dc2830a9e72f23c1dfebef4413003221baa5fb62,0c3f19cb47ebfbe643fb19fa94c874d18fa62d12/` """ search_stats = {"nbfiles": 0, "pct": 0} search_res = None queries = [] # GET: Many hash separated values request if q: hashes = q.split(",") for v in hashes: queries.append({"filename": None, "sha1": v}) # POST: Many hash requests in post form submission elif request.method == "POST": data = request.data # Remove potential inputs with no associated value for k, v in data.items(): if v is not None: if k == "q" and len(v) > 0: queries.append({"filename": None, "sha1": v}) elif v != "": queries.append({"filename": k, "sha1": v}) if queries: lookup = archive.lookup_multiple_hashes(queries) result = [] nb_queries = len(queries) for el in lookup: res_d = {"sha1": el["sha1"], "found": el["found"]} if "filename" in el and el["filename"]: res_d["filename"] = el["filename"] result.append(res_d) search_res = result nbfound = len([x for x in lookup if x["found"]]) search_stats["nbfiles"] = nb_queries search_stats["pct"] = int((nbfound / nb_queries) * 100) return {"search_res": search_res, "search_stats": search_stats} @api_route( r"/content/(?P[0-9a-z_:]*[0-9a-f]+)/", "api-1-content", checksum_args=["q"] ) @api_doc("/content/") @format_docstring() def api_content_metadata(request: Request, q: str): """ .. http:get:: /api/1/content/[(hash_type):](hash)/ Get information about a content (aka a "blob") object. In the archive, a content object is identified based on checksum values computed using various hashing algorithms. :param string hash_type: optional parameter specifying which hashing algorithm has been used to compute the content checksum. It can be either ``sha1``, ``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not provided, it is assumed that the hashing algorithm used is ``sha1``. :param string hash: hexadecimal representation of the checksum value computed with the specified hashing algorithm. {common_headers} :>json object checksums: object holding the computed checksum values for the requested content :>json string data_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/raw/` for downloading the content raw bytes :>json string filetype_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/filetype/` for getting information about the content MIME type :>json string language_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/language/` for getting information about the programming language used in the content :>json number length: length of the content in bytes :>json string license_url: link to :http:get:`/api/1/content/[(hash_type):](hash)/license/` for getting information about the license of the content :statuscode 200: no error :statuscode 400: an invalid **hash_type** or **hash** has been provided :statuscode 404: requested content can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`content/sha1_git:fe95a46679d128ff167b7c55df5d02356c5a1ae1/` """ return api_lookup( archive.lookup_content, q, notfound_msg="Content with {} not found.".format(q), enrich_fn=functools.partial(utils.enrich_content, query_string=q), request=request, ) diff --git a/swh/web/tests/api/views/test_content.py b/swh/web/tests/api/views/test_content.py index 68ea7398..af29714a 100644 --- a/swh/web/tests/api/views/test_content.py +++ b/swh/web/tests/api/views/test_content.py @@ -1,297 +1,339 @@ -# Copyright (C) 2015-2019 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.web.common.utils import reverse from swh.web.tests.conftest import ctags_json_missing, fossology_missing from swh.web.tests.data import random_content from swh.web.tests.utils import ( check_api_get_responses, check_api_post_responses, check_http_get_response, ) def test_api_content_filetype(api_client, indexer_data, content): indexer_data.content_add_mimetype(content["sha1"]) url = reverse( "api-1-content-filetype", url_args={"q": "sha1_git:%s" % content["sha1_git"]} ) rv = check_api_get_responses(api_client, url, status_code=200) content_url = reverse( "api-1-content", url_args={"q": "sha1:%s" % content["sha1"]}, request=rv.wsgi_request, ) expected_data = indexer_data.content_get_mimetype(content["sha1"]) expected_data["content_url"] = content_url assert rv.data == expected_data def test_api_content_filetype_sha_not_found(api_client): unknown_content_ = random_content() url = reverse( "api-1-content-filetype", url_args={"q": "sha1:%s" % unknown_content_["sha1"]} ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "No filetype information found for content " "sha1:%s." % unknown_content_["sha1"], } def test_api_content_language_sha_not_found(api_client): unknown_content_ = random_content() url = reverse( "api-1-content-language", url_args={"q": "sha1:%s" % unknown_content_["sha1"]} ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "No language information found for content " "sha1:%s." % unknown_content_["sha1"], } @pytest.mark.skip # Language indexer is disabled @pytest.mark.skipif( ctags_json_missing, reason="requires ctags with json output support" ) def test_api_content_symbol(api_client, indexer_data, contents_with_ctags): expected_data = {} for content_sha1 in contents_with_ctags["sha1s"]: indexer_data.content_add_ctags(content_sha1) for ctag in indexer_data.content_get_ctags(content_sha1): if ctag["name"] == contents_with_ctags["symbol_name"]: expected_data[content_sha1] = ctag break url = reverse( "api-1-content-symbol", url_args={"q": contents_with_ctags["symbol_name"]}, query_params={"per_page": 100}, ) rv = check_api_get_responses(api_client, url, status_code=200) for entry in rv.data: content_sha1 = entry["sha1"] expected_entry = expected_data[content_sha1] for key, view_name in ( ("content_url", "api-1-content"), ("data_url", "api-1-content-raw"), ("license_url", "api-1-content-license"), ("language_url", "api-1-content-language"), ("filetype_url", "api-1-content-filetype"), ): expected_entry[key] = reverse( view_name, url_args={"q": "sha1:%s" % content_sha1}, request=rv.wsgi_request, ) expected_entry["sha1"] = content_sha1 del expected_entry["id"] assert entry == expected_entry assert "Link" not in rv url = reverse( "api-1-content-symbol", url_args={"q": contents_with_ctags["symbol_name"]}, query_params={"per_page": 2}, ) rv = check_api_get_responses(api_client, url, status_code=200) next_url = ( reverse( "api-1-content-symbol", url_args={"q": contents_with_ctags["symbol_name"]}, query_params={"last_sha1": rv.data[1]["sha1"], "per_page": 2}, request=rv.wsgi_request, ), ) assert rv["Link"] == '<%s>; rel="next"' % next_url def test_api_content_symbol_not_found(api_client): url = reverse("api-1-content-symbol", url_args={"q": "bar"}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "No indexed raw content match expression 'bar'.", } assert "Link" not in rv @pytest.mark.skipif( ctags_json_missing, reason="requires ctags with json output support" ) def test_api_content_ctags(api_client, indexer_data, content): indexer_data.content_add_ctags(content["sha1"]) url = reverse( "api-1-content-ctags", url_args={"q": "sha1_git:%s" % content["sha1_git"]} ) rv = check_api_get_responses(api_client, url, status_code=200) content_url = reverse( "api-1-content", url_args={"q": "sha1:%s" % content["sha1"]}, request=rv.wsgi_request, ) expected_data = list(indexer_data.content_get_ctags(content["sha1"])) for e in expected_data: e["content_url"] = content_url assert rv.data == expected_data @pytest.mark.skipif(fossology_missing, reason="requires fossology-nomossa installed") def test_api_content_license(api_client, indexer_data, content): indexer_data.content_add_license(content["sha1"]) url = reverse( "api-1-content-license", url_args={"q": "sha1_git:%s" % content["sha1_git"]} ) rv = check_api_get_responses(api_client, url, status_code=200) content_url = reverse( "api-1-content", url_args={"q": "sha1:%s" % content["sha1"]}, request=rv.wsgi_request, ) expected_data = list(indexer_data.content_get_license(content["sha1"])) for license in expected_data: del license["id"] assert rv.data == { "content_url": content_url, "id": content["sha1"], "facts": expected_data, } def test_api_content_license_sha_not_found(api_client): unknown_content_ = random_content() url = reverse( "api-1-content-license", url_args={"q": "sha1:%s" % unknown_content_["sha1"]} ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "No license information found for content " "sha1:%s." % unknown_content_["sha1"], } def test_api_content_metadata(api_client, archive_data, content): url = reverse("api-1-content", {"q": "sha1:%s" % content["sha1"]}) rv = check_api_get_responses(api_client, url, status_code=200) expected_data = archive_data.content_get(content["sha1"]) for key, view_name in ( ("data_url", "api-1-content-raw"), ("license_url", "api-1-content-license"), ("language_url", "api-1-content-language"), ("filetype_url", "api-1-content-filetype"), ): expected_data[key] = reverse( view_name, url_args={"q": "sha1:%s" % content["sha1"]}, request=rv.wsgi_request, ) assert rv.data == expected_data def test_api_content_not_found(api_client): unknown_content_ = random_content() url = reverse("api-1-content", url_args={"q": "sha1:%s" % unknown_content_["sha1"]}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Content with sha1 checksum equals to %s not found!" % unknown_content_["sha1"], } def test_api_content_raw_ko_not_found(api_client): unknown_content_ = random_content() url = reverse( "api-1-content-raw", url_args={"q": "sha1:%s" % unknown_content_["sha1"]} ) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": "Content with sha1 checksum equals to %s not found!" % unknown_content_["sha1"], } def test_api_content_raw_text(api_client, archive_data, content): url = reverse("api-1-content-raw", url_args={"q": "sha1:%s" % content["sha1"]}) rv = check_http_get_response(api_client, url, status_code=200) assert rv["Content-Type"] == "application/octet-stream" assert ( rv["Content-disposition"] - == "attachment; filename=content_sha1_%s_raw" % content["sha1"] + == 'attachment; filename="content_sha1_%s_raw"' % content["sha1"] ) expected_data = archive_data.content_get_data(content["sha1"]) - assert rv.content == expected_data["data"] + assert b"".join(rv.streaming_content) == expected_data["data"] + assert int(rv["Content-Length"]) == len(expected_data["data"]) def test_api_content_raw_text_with_filename(api_client, archive_data, content): url = reverse( "api-1-content-raw", url_args={"q": "sha1:%s" % content["sha1"]}, query_params={"filename": "filename.txt"}, ) rv = check_http_get_response(api_client, url, status_code=200) - assert rv["Content-disposition"] == "attachment; filename=filename.txt" + assert rv["Content-disposition"] == 'attachment; filename="filename.txt"' assert rv["Content-Type"] == "application/octet-stream" expected_data = archive_data.content_get_data(content["sha1"]) - assert rv.content == expected_data["data"] + assert b"".join(rv.streaming_content) == expected_data["data"] + assert int(rv["Content-Length"]) == len(expected_data["data"]) + + +@pytest.mark.parametrize( + "encoded,expected", + [ + # From https://datatracker.ietf.org/doc/html/rfc5987#section-3.2.2 + ( + "%c2%a3%20and%20%e2%82%ac%20rates.txt", + "%C2%A3%20and%20%E2%82%AC%20rates.txt", + ), + ("%A3%20rates.txt", "%EF%BF%BD%20rates.txt"), + # found in the wild + ( + "Th%C3%A9orie%20de%20sant%C3%A9-aide-justice.pdf", + "Th%C3%A9orie%20de%20sant%C3%A9-aide-justice.pdf", + ), + ], +) +def test_api_content_raw_text_with_nonascii_filename( + api_client, archive_data, content, encoded, expected +): + url = reverse( + "api-1-content-raw", + url_args={"q": "sha1:%s" % content["sha1"]}, + ) + rv = check_http_get_response( + api_client, f"{url}?filename={encoded}", status_code=200 + ) + + # technically, ISO8859-1 is allowed too + assert rv["Content-disposition"].isascii(), rv["Content-disposition"] + + assert rv["Content-disposition"] == ( + f"attachment; filename*=utf-8''{expected}" + ), rv["Content-disposition"] + + assert rv["Content-Type"] == "application/octet-stream" + expected_data = archive_data.content_get_data(content["sha1"]) + assert b"".join(rv.streaming_content) == expected_data["data"] + assert int(rv["Content-Length"]) == len(expected_data["data"]) def test_api_check_content_known(api_client, content): url = reverse("api-1-content-known", url_args={"q": content["sha1"]}) rv = check_api_get_responses(api_client, url, status_code=200) assert rv.data == { "search_res": [{"found": True, "sha1": content["sha1"]}], "search_stats": {"nbfiles": 1, "pct": 100.0}, } def test_api_check_content_known_post(api_client, content): url = reverse("api-1-content-known") rv = check_api_post_responses( api_client, url, data={"q": content["sha1"]}, status_code=200 ) assert rv.data == { "search_res": [{"found": True, "sha1": content["sha1"]}], "search_stats": {"nbfiles": 1, "pct": 100.0}, } def test_api_check_content_known_not_found(api_client): unknown_content_ = random_content() url = reverse("api-1-content-known", url_args={"q": unknown_content_["sha1"]}) rv = check_api_get_responses(api_client, url, status_code=200) assert rv.data == { "search_res": [{"found": False, "sha1": unknown_content_["sha1"]}], "search_stats": {"nbfiles": 1, "pct": 0.0}, } def test_api_content_uppercase(api_client, content): url = reverse( "api-1-content-uppercase-checksum", url_args={"q": content["sha1"].upper()} ) rv = check_http_get_response(api_client, url, status_code=302) redirect_url = reverse("api-1-content", url_args={"q": content["sha1"]}) assert rv["location"] == redirect_url