diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py index 2e908a34..bbf7f096 100644 --- a/swh/web/api/views/origin.py +++ b/swh/web/api/views/origin.py @@ -1,638 +1,639 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.util import strtobool from functools import partial from swh.web.common import service from swh.web.common.exc import BadInputExc from swh.web.common.origin_visits import get_origin_visits from swh.web.common.utils import reverse from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup DOC_RETURN_ORIGIN = ''' :>json string origin_visits_url: link to in order to get information about the visits for that origin :>json string url: the origin canonical url :>json string type: the type of software origin (deprecated value; types are now associated to visits instead of origins) :>json number id: the origin unique identifier (deprecated value; you should only refer to origins based on their URL) ''' DOC_RETURN_ORIGIN_ARRAY = \ DOC_RETURN_ORIGIN.replace(':>json', ':>jsonarr') DOC_RETURN_ORIGIN_VISIT = ''' :>json string date: ISO representation of the visit date (in UTC) :>json str origin: the origin canonical url :>json string origin_url: link to get information about the origin :>jsonarr string snapshot: the snapshot identifier of the visit :>jsonarr string snapshot_url: link to :http:get:`/api/1/snapshot/(snapshot_id)/` in order to get information about the snapshot of the visit :>json string status: status of the visit (either **full**, **partial** or **ongoing**) :>json number visit: the unique identifier of the visit ''' DOC_RETURN_ORIGIN_VISIT_ARRAY = \ DOC_RETURN_ORIGIN_VISIT.replace(':>json', ':>jsonarr') 
DOC_RETURN_ORIGIN_VISIT_ARRAY += ''' :>jsonarr number id: the unique identifier of the origin :>jsonarr string origin_visit_url: link to :http:get:`/api/1/origin/(origin_url)/visit/(visit_id)/` in order to get information about the visit ''' def _enrich_origin(origin): if 'url' in origin: o = origin.copy() o['origin_visits_url'] = reverse( 'api-1-origin-visits', url_args={'origin_url': origin['url']}) return o return origin def _enrich_origin_visit(origin_visit, *, with_origin_link, with_origin_visit_link): ov = origin_visit.copy() if with_origin_link: ov['origin_url'] = reverse('api-1-origin', url_args={'origin_url': ov['origin']}) if with_origin_visit_link: ov['origin_visit_url'] = reverse('api-1-origin-visit', url_args={'origin_url': ov['origin'], 'visit_id': ov['visit']}) snapshot = ov['snapshot'] if snapshot: ov['snapshot_url'] = reverse('api-1-snapshot', url_args={'snapshot_id': snapshot}) else: ov['snapshot_url'] = None return ov @api_route(r'/origins/', 'api-1-origins') @api_doc('/origins/', noargs=True) @format_docstring(return_origin_array=DOC_RETURN_ORIGIN_ARRAY) def api_origins(request): """ .. http:get:: /api/1/origins/ Get list of archived software origins. Origins are sorted by ids before returning them. :query int origin_from: The first origin id that will be included in returned results (default to 1) :query int origin_count: The maximum number of origins to return (default to 100, can not exceed 10000) {return_origin_array} {common_headers} {resheader_link} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error **Example:** .. 
parsed-literal:: :swh_web_api:`origins?origin_from=50000&origin_count=500` """ origin_from = int(request.query_params.get('origin_from', '1')) origin_count = int(request.query_params.get('origin_count', '100')) origin_count = min(origin_count, 10000) results = api_lookup( service.lookup_origins, origin_from, origin_count+1, enrich_fn=_enrich_origin) response = {'results': results, 'headers': {}} if len(results) > origin_count: origin_from = results.pop()['id'] response['headers']['link-next'] = reverse( 'api-1-origins', query_params={'origin_from': origin_from, 'origin_count': origin_count}) return response @api_route(r'/origin/(?P[a-z]+)/url/(?P.+)/', 'api-1-origin') @api_route(r'/origin/(?P.+)/get/', 'api-1-origin') @api_route(r'/origin/(?P[0-9]+)/', 'api-1-origin') @api_doc('/origin/') @format_docstring(return_origin=DOC_RETURN_ORIGIN) def api_origin(request, origin_id=None, origin_type=None, origin_url=None): """ .. http:get:: /api/1/origin/(origin_url)/get/ Get information about a software origin. :param string origin_url: the origin url {return_origin} {common_headers} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/git/url/https://github.com/python/cpython/` .. http:get:: /api/1/origin/(origin_id)/ Get information about a software origin. .. warning:: All endpoints using an ``origin_id`` or an ``origin_type`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. You should use :http:get:`/api/1/origin/(origin_url)/get/` instead. 
:param int origin_id: a software origin identifier {return_origin} {common_headers} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/1/` .. http:get:: /api/1/origin/(origin_type)/url/(origin_url)/ Get information about a software origin. .. warning:: All endpoints using an ``origin_id`` or an ``origin_type`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. You should use :http:get:`/api/1/origin/(origin_url)/get/` instead. :param string origin_type: the origin type (possible values are ``git``, ``svn``, ``hg``, ``deb``, ``pypi``, ``npm``, ``ftp`` or ``deposit``) :param string origin_url: the origin url {return_origin} {common_headers} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/git/url/https://github.com/python/cpython/` """ ori_dict = { 'id': int(origin_id) if origin_id else None, 'type': origin_type, 'url': origin_url } ori_dict = {k: v for k, v in ori_dict.items() if ori_dict[k]} error_msg = 'Origin %s not found.' % \ (ori_dict.get('id') or ori_dict['url']) return api_lookup( service.lookup_origin, ori_dict, notfound_msg=error_msg, enrich_fn=_enrich_origin) @api_route(r'/origin/search/(?P.+)/', 'api-1-origin-search') @api_doc('/origin/search/') @format_docstring(return_origin_array=DOC_RETURN_ORIGIN_ARRAY) def api_origin_search(request, url_pattern): """ .. http:get:: /api/1/origin/search/(url_pattern)/ Search for software origins whose urls contain a provided string pattern or match a provided regular expression. The search is performed in a case insensitive way. 
:param string url_pattern: a string pattern or a regular expression :query int offset: the number of found origins to skip before returning results :query int limit: the maximum number of found origins to return :query boolean regexp: if true, consider provided pattern as a regular expression and search origins whose urls match it :query boolean with_visit: if true, only return origins with at least one visit by Software heritage {return_origin_array} {common_headers} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`origin/search/python/?limit=2` """ result = {} offset = int(request.query_params.get('offset', '0')) limit = int(request.query_params.get('limit', '70')) regexp = request.query_params.get('regexp', 'false') with_visit = request.query_params.get('with_visit', 'false') results = api_lookup(service.search_origin, url_pattern, offset, limit, bool(strtobool(regexp)), bool(strtobool(with_visit)), enrich_fn=_enrich_origin) nb_results = len(results) if nb_results == limit: query_params = {} query_params['offset'] = offset + limit query_params['limit'] = limit query_params['regexp'] = regexp result['headers'] = { 'link-next': reverse('api-1-origin-search', url_args={'url_pattern': url_pattern}, query_params=query_params) } result.update({ 'results': results }) return result @api_route(r'/origin/metadata-search/', 'api-1-origin-metadata-search') @api_doc('/origin/metadata-search/', noargs=True, need_params=True) @format_docstring(return_origin_array=DOC_RETURN_ORIGIN_ARRAY) def api_origin_metadata_search(request): """ .. http:get:: /api/1/origin/metadata-search/ Search for software origins whose metadata (expressed as a JSON-LD/CodeMeta dictionary) match the provided criteria. For now, only full-text search on this dictionary is supported. 
:query str fulltext: a string that will be matched against origin metadata; results are ranked and ordered starting with the best ones. :query int limit: the maximum number of found origins to return (bounded to 100) {return_origin_array} {common_headers} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error **Example:** .. parsed-literal:: :swh_web_api:`origin/metadata-search/?limit=2&fulltext=Jane%20Doe` """ fulltext = request.query_params.get('fulltext', None) limit = min(int(request.query_params.get('limit', '70')), 100) if not fulltext: content = '"fulltext" must be provided and non-empty.' raise BadInputExc(content) results = api_lookup(service.search_origin_metadata, fulltext, limit) return { 'results': results, } @api_route(r'/origin/(?P.*)/visits/', 'api-1-origin-visits') @api_route(r'/origin/(?P[0-9]+)/visits/', 'api-1-origin-visits') @api_doc('/origin/visits/') @format_docstring( return_origin_visit_array=DOC_RETURN_ORIGIN_VISIT_ARRAY) def api_origin_visits(request, origin_id=None, origin_url=None): """ .. http:get:: /api/1/origin/(origin_url)/visits/ Get information about all visits of a software origin. Visits are returned sorted in descending order according to their date. :param str origin_url: a software origin URL :query int per_page: specify the number of visits to list, for pagination purposes :query int last_visit: visit to start listing from, for pagination purposes {common_headers} {resheader_link} {return_origin_visit_array} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/hylang/hy/visits/` .. http:get:: /api/1/origin/(origin_id)/visits/ Get information about all visits of a software origin. 
Visits are returned sorted in descending order according to their date. .. warning:: All endpoints using an ``origin_id`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. Use :http:get:`/api/1/origin/(origin_url)/visits/` instead. :param int origin_id: a software origin identifier :query int per_page: specify the number of visits to list, for pagination purposes :query int last_visit: visit to start listing from, for pagination purposes {common_headers} {resheader_link} {return_origin_visit_array} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/1/visits/` """ result = {} if origin_url: origin_query = {'url': origin_url} notfound_msg = 'No origin {} found'.format(origin_url) url_args_next = {'origin_url': origin_url} else: origin_query = {'id': int(origin_id)} notfound_msg = 'No origin {} found'.format(origin_id) url_args_next = {'origin_id': origin_id} per_page = int(request.query_params.get('per_page', '10')) last_visit = request.query_params.get('last_visit') if last_visit: last_visit = int(last_visit) def _lookup_origin_visits( origin_query, last_visit=last_visit, per_page=per_page): all_visits = get_origin_visits(origin_query) all_visits.reverse() visits = [] if not last_visit: visits = all_visits[:per_page] else: for i, v in enumerate(all_visits): if v['visit'] == last_visit: visits = all_visits[i+1:i+1+per_page] break for v in visits: yield v results = api_lookup(_lookup_origin_visits, origin_query, notfound_msg=notfound_msg, enrich_fn=partial(_enrich_origin_visit, with_origin_link=False, with_origin_visit_link=True)) if results: nb_results = len(results) if nb_results == per_page: new_last_visit = results[-1]['visit'] query_params = {} query_params['last_visit'] = new_last_visit if 
request.query_params.get('per_page'): query_params['per_page'] = per_page result['headers'] = { 'link-next': reverse('api-1-origin-visits', url_args=url_args_next, query_params=query_params) } result.update({ 'results': results }) return result @api_route(r'/origin/(?P.*)/visit/latest/', 'api-1-origin-visit-latest', throttle_scope='swh_api_origin_visit_latest') @api_doc('/origin/visit/') @format_docstring(return_origin_visit=DOC_RETURN_ORIGIN_VISIT) def api_origin_visit_latest(request, origin_url=None): """ .. http:get:: /api/1/origin/(origin_url)/visit/latest/ Get information about a specific visit of a software origin. :param str origin_url: a software origin URL :query boolean require_snapshot: if true, only return a visit with a snapshot {common_headers} {return_origin_visit} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin or visit can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/hylang/hy/visit/latest/` """ require_snapshot = request.query_params.get('require_snapshot', 'false') return api_lookup( service.lookup_origin_visit_latest, origin_url, bool(strtobool(require_snapshot)), notfound_msg=('No visit for origin {} found' .format(origin_url)), enrich_fn=partial(_enrich_origin_visit, with_origin_link=True, with_origin_visit_link=False)) @api_route(r'/origin/(?P.*)/visit/(?P[0-9]+)/', 'api-1-origin-visit') @api_route(r'/origin/(?P[0-9]+)/visit/(?P[0-9]+)/', 'api-1-origin-visit') @api_doc('/origin/visit/') @format_docstring(return_origin_visit=DOC_RETURN_ORIGIN_VISIT) def api_origin_visit(request, visit_id, origin_url=None, origin_id=None): """ .. http:get:: /api/1/origin/(origin_url)/visit/(visit_id)/ Get information about a specific visit of a software origin. 
:param str origin_url: a software origin URL :param int visit_id: a visit identifier {common_headers} {return_origin_visit} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin or visit can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/https://github.com/hylang/hy/visit/1/` .. http:get:: /api/1/origin/(origin_id)/visit/(visit_id)/ Get information about a specific visit of a software origin. .. warning:: All endpoints using an ``origin_id`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. Use :http:get:`/api/1/origin/(origin_url)/visit/(visit_id)` instead. :param int origin_id: a software origin identifier :param int visit_id: a visit identifier {common_headers} {return_origin_visit} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin or visit can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/1500/visit/1/` """ if not origin_url: origin_url = service.lookup_origin({'id': int(origin_id)})['url'] return api_lookup( service.lookup_origin_visit, origin_url, int(visit_id), notfound_msg=('No visit {} for origin {} found' .format(visit_id, origin_url)), enrich_fn=partial(_enrich_origin_visit, with_origin_link=True, with_origin_visit_link=False)) @api_route(r'/origin/(?P[a-z]+)/url/(?P.+)' '/intrinsic-metadata', 'api-origin-intrinsic-metadata') @api_doc('/origin/intrinsic-metadata/') @format_docstring() def api_origin_intrinsic_metadata(request, origin_type, origin_url): """ .. http:get:: /api/1/origin/(origin_type)/url/(origin_url)/intrinsic-metadata Get intrinsic metadata of a software origin (as a JSON-LD/CodeMeta dictionary). 
:param string origin_type: the origin type (possible values are ``git``, ``svn``, ``hg``, ``deb``, ``pypi``, ``npm``, ``ftp`` or ``deposit``) :param string origin_url: the origin url :>json string ???: intrinsic metadata field of the origin {common_headers} - **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, :http:method:`options` + **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, + :http:method:`options` :statuscode 200: no error :statuscode 404: requested origin can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`origin/git/url/https://github.com/python/cpython/intrinsic-metadata` """ # noqa ori_dict = { 'type': origin_type, 'url': origin_url } error_msg = 'Origin with URL %s not found' % ori_dict['url'] return api_lookup( service.lookup_origin_intrinsic_metadata, ori_dict, notfound_msg=error_msg, enrich_fn=_enrich_origin) diff --git a/swh/web/api/views/origin_save.py b/swh/web/api/views/origin_save.py index 34e5ea05..6ac43fe1 100644 --- a/swh/web/api/views/origin_save.py +++ b/swh/web/api/views/origin_save.py @@ -1,87 +1,87 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.views.decorators.cache import never_cache from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.common.origin_save import ( create_save_origin_request, get_save_origin_requests ) @api_route(r'/origin/save/(?P.+)/url/(?P.+)/', 'api-1-save-origin', methods=['GET', 'POST'], throttle_scope='swh_save_origin') @never_cache @api_doc('/origin/save/') @format_docstring() def api_save_origin(request, origin_type, origin_url): """ .. http:get:: /api/1/origin/save/(origin_type)/url/(origin_url)/ .. 
http:post:: /api/1/origin/save/(origin_type)/url/(origin_url)/ Request the saving of a software origin into the archive or check the status of previously created save requests. That endpoint enables to create a saving task for a software origin through a POST request. Depending of the provided origin url, the save request can either be: * immediately **accepted**, for well known code hosting providers like for instance GitHub or GitLab * **rejected**, in case the url is blacklisted by Software Heritage * **put in pending state** until a manual check is done in order to determine if it can be loaded or not Once a saving request has been accepted, its associated saving task status can then be checked through a GET request on the same url. Returned status can either be: * **not created**: no saving task has been created * **not yet scheduled**: saving task has been created but its - execution has not yet been scheduled + execution has not yet been scheduled * **scheduled**: the task execution has been scheduled * **succeed**: the saving task has been successfully executed * **failed**: the saving task has been executed but it failed When issuing a POST request an object will be returned while a GET request will return an array of objects (as multiple save requests might have been submitted for the same origin). 
:param string origin_type: the type of origin to save (currently the supported types are ``git``, ``hg`` and ``svn``) :param string origin_url: the url of the origin to save {common_headers} :>json string origin_url: the url of the origin to save :>json string origin_type: the type of the origin to save :>json string save_request_date: the date (in iso format) the save request was issued :>json string save_request_status: the status of the save request, either **accepted**, **rejected** or **pending** :>json string save_task_status: the status of the origin saving task, either **not created**, **not yet scheduled**, **scheduled**, **succeed** or **failed** **Allowed HTTP Methods:** :http:method:`get`, :http:method:`post`, - :http:method:`head`, :http:method:`options` + :http:method:`head`, :http:method:`options` :statuscode 200: no error :statuscode 400: an invalid origin type or url has been provided :statuscode 403: the provided origin url is blacklisted :statuscode 404: no save requests have been found for a given origin """ if request.method == 'POST': sor = create_save_origin_request(origin_type, origin_url) del sor['id'] else: sor = get_save_origin_requests(origin_type, origin_url) for s in sor: del s['id'] # noqa return sor diff --git a/swh/web/api/views/release.py b/swh/web/api/views/release.py index b43ecdc3..ca765fb7 100644 --- a/swh/web/api/views/release.py +++ b/swh/web/api/views/release.py @@ -1,59 +1,59 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.web.common import service from swh.web.api import utils from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup @api_route(r'/release/(?P[0-9a-f]+)/', 'api-1-release', checksum_args=['sha1_git']) 
@api_doc('/release/') @format_docstring() def api_release(request, sha1_git): """ .. http:get:: /api/1/release/(sha1_git)/ Get information about a release in the archive. Releases are identified by **sha1** checksums, compatible with Git tag identifiers. See :func:`swh.model.identifiers.release_identifier` in our data model module for details about how they are computed. :param string sha1_git: hexadecimal representation of the release **sha1_git** identifier {common_headers} :>json object author: information about the author of the release :>json string date: ISO representation of the release date (in UTC) :>json string id: the release unique identifier :>json string message: the message associated to the release :>json string name: the name of the release :>json string target: the target identifier of the release :>json string target_type: the type of the target, can be either **release**, **revision**, **content**, **directory** :>json string target_url: a link to the adequate api url based on the target type **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error :statuscode 400: an invalid **sha1_git** value has been provided :statuscode 404: requested release can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`release/208f61cc7a5dbc9879ae6e5c2f95891e270f09ef/` """ error_msg = 'Release with sha1_git %s not found.' 
% sha1_git return api_lookup( service.lookup_release, sha1_git, notfound_msg=error_msg, enrich_fn=utils.enrich_release) diff --git a/swh/web/api/views/revision.py b/swh/web/api/views/revision.py index e7f676b9..bb9aab9f 100644 --- a/swh/web/api/views/revision.py +++ b/swh/web/api/views/revision.py @@ -1,473 +1,477 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.http import HttpResponse from swh.web.common import service from swh.web.common.utils import reverse from swh.web.common.utils import parse_timestamp from swh.web.api import utils from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route from swh.web.api.views.utils import api_lookup DOC_RETURN_REVISION = ''' :>json object author: information about the author of the revision :>json object committer: information about the committer of the revision :>json string committer_date: ISO representation of the commit date (in UTC) :>json string date: ISO representation of the revision date (in UTC) :>json string directory: the unique identifier that revision points to :>json string directory_url: link to :http:get:`/api/1/directory/(sha1_git)/[(path)/]` to get information about the directory associated to the revision :>json string id: the revision unique identifier :>json boolean merge: whether or not the revision corresponds to a merge commit :>json string message: the message associated to the revision :>json array parents: the parents of the revision, i.e. 
the previous revisions that head directly to it, each entry of that array contains an unique parent revision identifier but also a link to :http:get:`/api/1/revision/(sha1_git)/` to get more information about it :>json string type: the type of the revision ''' # noqa DOC_RETURN_REVISION_ARRAY = \ DOC_RETURN_REVISION.replace(':>json', ':>jsonarr') def _revision_directory_by(revision, path, request_path, limit=100, with_data=False): """ Compute the revision matching criterion's directory or content data. Args: revision: dictionary of criterions representing a revision to lookup path: directory's path to lookup request_path: request path which holds the original context to limit: optional query parameter to limit the revisions log (default to 100). For now, note that this limit could impede the transitivity conclusion about sha1_git not being an ancestor of with_data: indicate to retrieve the content's raw data if path resolves to a content. """ def enrich_directory_local(dir, context_url=request_path): return utils.enrich_directory(dir, context_url) rev_id, result = service.lookup_directory_through_revision( revision, path, limit=limit, with_data=with_data) content = result['content'] if result['type'] == 'dir': # dir_entries result['content'] = list(map(enrich_directory_local, content)) elif result['type'] == 'file': # content result['content'] = utils.enrich_content(content) elif result['type'] == 'rev': # revision result['content'] = utils.enrich_revision(content) return result @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/log/', 'api-1-revision-origin-log') @api_route(r'/revision/origin/(?P[0-9]+)/log/', 'api-1-revision-origin-log') @api_route(r'/revision/origin/(?P[0-9]+)' r'/ts/(?P.+)/log/', 'api-1-revision-origin-log') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)' r'/ts/(?P.+)/log/', 'api-1-revision-origin-log') @api_doc('/revision/origin/log/') @format_docstring(return_revision_array=DOC_RETURN_REVISION_ARRAY) def 
api_revision_log_by(request, origin_id, branch_name='HEAD', ts=None): """ .. http:get:: /api/1/revision/origin/(origin_id)[/branch/(branch_name)][/ts/(timestamp)]/log Show the commit log for a revision, searching for it based on software origin, branch name, and/or visit timestamp. This endpoint behaves like :http:get:`/api/1/revision/(sha1_git)[/prev/(prev_sha1s)]/log/`, but operates on the revision that has been found at a given software origin, close to a given point in time, pointed by a given branch. .. warning:: All endpoints using an ``origin_id`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. You should instead use successively :http:get:`/api/1/origin/(origin_url)/visits/`, :http:get:`/api/1/snapshot/(snapshot_id)/`, and :http:get:`/api/1/revision/(sha1_git)[/prev/(prev_sha1s)]/log/`. :param int origin_id: a software origin identifier :param string branch_name: optional parameter specifying a fully-qualified branch name associated to the software origin, e.g., "refs/heads/master". Defaults to the HEAD branch. :param string timestamp: optional parameter specifying a timestamp close to which the revision pointed by the given branch should be looked up. The timestamp can be expressed either as an ISO date or as a Unix one (in UTC). Defaults to now. {common_headers} {return_revision_array} - **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, :http:method:`options` + **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, + :http:method:`options` :statuscode 200: no error :statuscode 404: no revision matching the given criteria could be found in the archive **Example:** .. 
parsed-literal:: :swh_web_api:`revision/origin/723566/ts/2016-01-17T00:00:00+00:00/log/` """ # noqa result = {} per_page = int(request.query_params.get('per_page', '10')) def lookup_revision_log_by_with_limit(o_id, br, ts, limit=per_page+1): return service.lookup_revision_log_by(o_id, br, ts, limit) error_msg = 'No revision matching origin %s ' % origin_id error_msg += ', branch name %s' % branch_name error_msg += (' and time stamp %s.' % ts) if ts else '.' rev_get = api_lookup( lookup_revision_log_by_with_limit, int(origin_id), branch_name, ts, notfound_msg=error_msg, enrich_fn=utils.enrich_revision) nb_rev = len(rev_get) if nb_rev == per_page+1: revisions = rev_get[:-1] last_sha1_git = rev_get[-1]['id'] params = {k: v for k, v in {'origin_id': origin_id, 'branch_name': branch_name, 'ts': ts, }.items() if v is not None} query_params = {} query_params['sha1_git'] = last_sha1_git if request.query_params.get('per_page'): query_params['per_page'] = per_page result['headers'] = { 'link-next': reverse('api-1-revision-origin-log', url_args=params, query_params=query_params) } else: revisions = rev_get result.update({'results': revisions}) return result @api_route(r'/revision/origin/(?P[0-9]+)/directory/', 'api-1-revision-origin-directory') @api_route(r'/revision/origin/(?P[0-9]+)/directory/(?P.+)/', 'api-1-revision-origin-directory') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/directory/', 'api-1-revision-origin-directory') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/ts/(?P.+)/directory/', 'api-1-revision-origin-directory') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/directory/(?P.+)/', 'api-1-revision-origin-directory') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/ts/(?P.+)' r'/directory/(?P.+)/', 'api-1-revision-origin-directory') @api_doc('/revision/origin/directory/', tags=['hidden']) def api_directory_through_revision_origin(request, origin_id, branch_name='HEAD', ts=None, path=None, with_data=False): """ 
Display directory or content information through a revision identified by origin/branch/timestamp. .. warning:: All endpoints using an ``origin_id`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. You should instead use successively :http:get:`/api/1/origin/(origin_url)/visits/`, :http:get:`/api/1/snapshot/(snapshot_id)/`, :http:get:`/api/1/revision/(sha1_git)/`, :http:get:`/api/1/directory/(sha1_git)/[(path)/]` """ if ts: ts = parse_timestamp(ts) return _revision_directory_by({'origin_id': int(origin_id), 'branch_name': branch_name, 'ts': ts }, path, request.path, with_data=with_data) @api_route(r'/revision/origin/(?P[0-9]+)/', 'api-1-revision-origin') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/', 'api-1-revision-origin') @api_route(r'/revision/origin/(?P[0-9]+)' r'/branch/(?P.+)/ts/(?P.+)/', 'api-1-revision-origin') @api_route(r'/revision/origin/(?P[0-9]+)/ts/(?P.+)/', 'api-1-revision-origin') @api_doc('/revision/origin/') @format_docstring(return_revision=DOC_RETURN_REVISION) def api_revision_with_origin(request, origin_id, branch_name='HEAD', ts=None): """ .. http:get:: /api/1/revision/origin/(origin_id)/[branch/(branch_name)/][ts/(timestamp)/] Get information about a revision, searching for it based on software origin, branch name, and/or visit timestamp. This endpoint behaves like :http:get:`/api/1/revision/(sha1_git)/`, but operates on the revision that has been found at a given software origin, close to a given point in time, pointed by a given branch. .. warning:: All endpoints using an ``origin_id`` are deprecated and will be removed in the near future. Only those using an ``origin_url`` will remain available. You should instead use successively :http:get:`/api/1/origin/(origin_url)/visits/`, :http:get:`/api/1/snapshot/(snapshot_id)/`, and :http:get:`/api/1/revision/(sha1_git)/`. 
# The ``sha1_git`` named group is bound to the view argument of the same
# name and is also listed in ``checksum_args``.
@api_route(r'/revision/(?P<sha1_git>[0-9a-f]+)/', 'api-1-revision',
           checksum_args=['sha1_git'])
@api_doc('/revision/')
@format_docstring(return_revision=DOC_RETURN_REVISION)
def api_revision(request, sha1_git):
    """
    .. http:get:: /api/1/revision/(sha1_git)/

        Get information about a revision in the archive.
        Revisions are identified by **sha1** checksums, compatible with Git
        commit identifiers.
        See :func:`swh.model.identifiers.revision_identifier` in our data model
        module for details about how they are computed.

        :param string sha1_git: hexadecimal representation of the revision
            **sha1_git** identifier

        {common_headers}

        {return_revision}

        **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`,
        :http:method:`options`

        :statuscode 200: no error
        :statuscode 400: an invalid **sha1_git** value has been provided
        :statuscode 404: requested revision can not be found in the archive

        **Example:**

        .. parsed-literal::

            :swh_web_api:`revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/`
    """ # noqa
    # The lookup result is passed through ``utils.enrich_revision`` before
    # being returned; ``notfound_msg`` is the message used when nothing
    # matches.
    not_found = 'Revision with sha1_git {} not found.'.format(sha1_git)
    return api_lookup(
        service.lookup_revision, sha1_git,
        notfound_msg=not_found,
        enrich_fn=utils.enrich_revision)
@api_route(r'/revision/(?P<sha1_git>[0-9a-f]+)/log/', 'api-1-revision-log',
           checksum_args=['sha1_git'])
@api_route(r'/revision/(?P<sha1_git>[0-9a-f]+)'
           r'/prev/(?P<prev_sha1s>[0-9a-f]*/*)/log/',
           'api-1-revision-log', checksum_args=['sha1_git', 'prev_sha1s'])
@api_doc('/revision/log/')
@format_docstring(return_revision_array=DOC_RETURN_REVISION_ARRAY)
def api_revision_log(request, sha1_git, prev_sha1s=None):
    """
    .. http:get:: /api/1/revision/(sha1_git)[/prev/(prev_sha1s)]/log/

        Get a list of all revisions heading to a given one, in other words show
        the commit log.

        :param string sha1_git: hexadecimal representation of the revision
            **sha1_git** identifier
        :param string prev_sha1s: optional parameter representing the navigation
            breadcrumbs (descendant revisions previously visited). If multiple
            values, use / as delimiter. If provided, revisions information will
            be added at the beginning of the returned list.
        :query int per_page: number of elements in the returned list, for
            pagination purpose

        {common_headers}
        {resheader_link}

        {return_revision_array}

        **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`,
        :http:method:`options`

        :statuscode 200: no error
        :statuscode 400: an invalid **sha1_git** value has been provided
        :statuscode 404: requested revision can not be found in the archive

        **Example:**

        .. parsed-literal::

            :swh_web_api:`revision/e1a315fa3fa734e2a6154ed7b5b9ae0eb8987aad/log/`
    """ # noqa
    result = {}
    per_page = int(request.query_params.get('per_page', '10'))

    # One extra revision is requested so that its presence tells us whether
    # a next page exists; that extra revision is never returned itself.
    def lookup_revision_log_with_limit(s, limit=per_page+1):
        return service.lookup_revision_log(s, limit)

    error_msg = 'Revision with sha1_git %s not found.' % sha1_git
    rev_get = api_lookup(lookup_revision_log_with_limit, sha1_git,
                         notfound_msg=error_msg,
                         enrich_fn=utils.enrich_revision)

    if len(rev_get) == per_page+1:
        rev_backward = rev_get[:-1]
        # the surplus revision becomes the starting point of the next page
        new_last_sha1 = rev_get[-1]['id']
        query_params = {}

        # only propagate per_page when the client set it explicitly
        if request.query_params.get('per_page'):
            query_params['per_page'] = per_page

        result['headers'] = {
            'link-next': reverse('api-1-revision-log',
                                 url_args={'sha1_git': new_last_sha1},
                                 query_params=query_params)
        }
    else:
        rev_backward = rev_get

    # The breadcrumb route accepts doubled and trailing slashes, which would
    # otherwise yield empty identifiers after splitting; drop them.
    rev_forward_ids = [sha1 for sha1 in (prev_sha1s or '').split('/') if sha1]

    if not rev_forward_ids:  # no nav breadcrumbs, so we're done
        revisions = rev_backward
    else:
        rev_forward = api_lookup(
            service.lookup_revision_multiple, rev_forward_ids,
            notfound_msg=error_msg,
            enrich_fn=utils.enrich_revision)
        # breadcrumb revisions come first, then the log itself
        revisions = rev_forward + rev_backward

    result.update({
        'results': revisions
    })
    return result
@api_route(r'/snapshot/(?P<snapshot_id>[0-9a-f]+)/', 'api-1-snapshot',
           checksum_args=['snapshot_id'])
@api_doc('/snapshot/')
@format_docstring()
def api_snapshot(request, snapshot_id):
    """
    .. http:get:: /api/1/snapshot/(snapshot_id)/

        Get information about a snapshot in the archive.

        A snapshot is a set of named branches, which are pointers to objects
        at any level of the Software Heritage DAG. It represents a full picture
        of an origin at a given time.

        As well as pointing to other objects in the Software Heritage DAG,
        branches can also be aliases, in which case their target is the name
        of another branch in the same snapshot, or dangling, in which case
        the target is unknown.

        A snapshot identifier is a salted sha1. See
        :func:`swh.model.identifiers.snapshot_identifier` in our data model
        module for details about how they are computed.

        :param sha1 snapshot_id: a snapshot identifier
        :query str branches_from: optional parameter used to skip branches
            whose name is lesser than it before returning them
        :query int branches_count: optional parameter used to restrain
            the amount of returned branches (default to 1000)
        :query str target_types: optional comma separated list parameter
            used to filter the target types of branch to return (possible
            values that can be contained in that list are ``content``,
            ``directory``, ``revision``, ``release``, ``snapshot`` or
            ``alias``)

        {common_headers}
        {resheader_link}

        :>json object branches: object containing all branches associated to
            the snapshot, for each of them the associated target type and id
            are given but also a link to get information about that target
        :>json string id: the unique identifier of the snapshot

        **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`,
        :http:method:`options`

        :statuscode 200: no error
        :statuscode 400: an invalid snapshot identifier has been provided
        :statuscode 404: requested snapshot can not be found in the archive

        **Example:**

        .. parsed-literal::

            :swh_web_api:`snapshot/6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a/`
    """
    def _enrich_snapshot(snapshot):
        # Work on a copy and give every non-dangling branch its enriched
        # form; dangling branches (falsy targets) are kept as None.
        enriched = snapshot.copy()
        if 'branches' not in enriched:
            return enriched

        branches = {
            name: utils.enrich_object(target) if target else None
            for name, target in enriched['branches'].items()
        }
        enriched['branches'] = branches

        # Alias branches borrow the target_url of the branch they point to.
        for target in branches.values():
            if not target or target['target_type'] != 'alias':
                continue
            aliased_name = target['target']
            if aliased_name in branches:
                aliased = branches[aliased_name]
                if aliased:
                    target['target_url'] = aliased['target_url']
                continue
            # The aliased branch is not part of the returned branches:
            # fetch it on its own.
            snp = service.lookup_snapshot(enriched['id'],
                                          branches_from=aliased_name,
                                          branches_count=1)
            if snp and aliased_name in snp['branches']:
                aliased = snp['branches'][aliased_name]
                # the aliased branch may itself be dangling (None)
                if aliased:
                    aliased = utils.enrich_object(aliased)
                    target['target_url'] = aliased['target_url']
        return enriched

    snapshot_content_max_size = get_config()['snapshot_content_max_size']

    branches_from = request.GET.get('branches_from', '')
    branches_count = int(request.GET.get('branches_count',
                                         snapshot_content_max_size))
    target_types = request.GET.get('target_types', None)
    target_types = target_types.split(',') if target_types else None

    results = api_lookup(
        service.lookup_snapshot, snapshot_id, branches_from,
        branches_count, target_types,
        notfound_msg='Snapshot with id {} not found.'.format(snapshot_id),
        enrich_fn=_enrich_snapshot)

    response = {'results': results, 'headers': {}}

    if results['next_branch'] is not None:
        # Serialise the filter back to the comma-separated form parsed
        # above, so that following the link applies the same filter.
        next_target_types = ','.join(target_types) if target_types else None
        response['headers']['link-next'] = \
            reverse('api-1-snapshot',
                    url_args={'snapshot_id': snapshot_id},
                    query_params={'branches_from': results['next_branch'],
                                  'branches_count': branches_count,
                                  'target_types': next_target_types})

    return response
information from swh.web.common import service from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route @api_route(r'/stat/counters/', 'api-1-stat-counters') @api_doc('/stat/counters/', noargs=True) @format_docstring() def api_stats(request): """ .. http:get:: /api/1/stat/counters/ Get statistics about the content of the archive. :>json number content: current number of content objects (aka files) in the archive :>json number directory: current number of directory objects in the archive :>json number origin: current number of software origins (an origin is a "place" where code source can be found, e.g. a git repository, a tarball, ...) in the archive :>json number origin_visit: current number of visits on software origins to fill the archive :>json number person: current number of persons (code source authors or committers) in the archive :>json number release: current number of releases objects in the archive :>json number revision: current number of revision objects (aka commits) in the archive :>json number skipped_content: current number of content objects (aka files) which where not inserted in the archive :>json number snapshot: current number of snapshot objects (aka set of named branches) in the archive {common_headers} **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, - :http:method:`options` + :http:method:`options` :statuscode 200: no error **Example:** .. 
def _dispatch_cook_progress(request, obj_type, obj_id):
    """Schedule a vault cooking task or report on an existing one.

    A POST request asks the vault to cook the object, optionally registering
    the ``email`` parameter (read from the POST data first, then from the
    query string). Any other method reaching this helper is answered with
    the cooking progress, so the callers, which index the returned value,
    never receive ``None``.

    Args:
        request: the incoming HTTP request
        obj_type (str): vault object type; the part before the first
            underscore, title-cased, is used in error messages
        obj_id: the object identifier, rendered with
            ``hashutil.hash_to_hex`` in error messages

    Returns:
        Whatever ``api_lookup`` returns for ``service.vault_cook`` (POST)
        or ``service.vault_progress`` (other methods).
    """
    hex_id = hashutil.hash_to_hex(obj_id)
    object_name = obj_type.split('_')[0].title()

    if request.method == 'POST':
        email = request.POST.get('email', request.GET.get('email', None))
        return api_lookup(
            service.vault_cook, obj_type, obj_id, email,
            notfound_msg=("{} '{}' not found."
                          .format(object_name, hex_id)))

    # GET, plus read-only methods served by the same view (e.g. HEAD)
    return api_lookup(
        service.vault_progress, obj_type, obj_id,
        notfound_msg=("{} '{}' was never requested."
                      .format(object_name, hex_id)))
@api_route(r'/vault/directory/(?P<dir_id>[0-9a-f]+)/raw/',
           'api-1-vault-fetch-directory',
           checksum_args=['dir_id'])
@api_doc('/vault/directory/raw/', handle_response=True)
def api_vault_fetch_directory(request, dir_id):
    """
    .. http:get:: /api/1/vault/directory/(dir_id)/raw/

        Fetch the cooked archive for a directory.

        See :http:get:`/api/1/vault/directory/(dir_id)/` to get more
        details on directory cooking.

        :param string dir_id: the directory's sha1 identifier

        :resheader Content-Type: application/gzip

        **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`,
        :http:method:`options`

        :statuscode 200: no error
        :statuscode 400: an invalid directory identifier has been provided
        :statuscode 404: requested directory can not be found in the archive
    """
    # Validate the identifier; only the parsed hash value is needed here.
    _, obj_id = query.parse_hash_with_algorithms_or_throws(
        dir_id, ['sha1'], 'Only sha1_git is supported.')

    archive = api_lookup(
        service.vault_fetch, 'directory', obj_id,
        notfound_msg="Directory with ID '{}' not found.".format(dir_id))

    # Serve the archive as a download named after the directory id.
    fname = '{}.tar.gz'.format(dir_id)
    response = HttpResponse(archive, content_type='application/gzip')
    response['Content-disposition'] = 'attachment; filename={}'.format(fname)
    return response
@api_route(r'/vault/revision/(?P<rev_id>[0-9a-f]+)/gitfast/raw/',
           'api-1-vault-fetch-revision_gitfast',
           checksum_args=['rev_id'])
@api_doc('/vault/revision/gitfast/raw/', handle_response=True)
def api_vault_fetch_revision_gitfast(request, rev_id):
    """
    .. http:get:: /api/1/vault/revision/(rev_id)/gitfast/raw/

        Fetch the cooked gitfast archive for a revision.

        See :http:get:`/api/1/vault/revision/(rev_id)/gitfast/` to get more
        details on revision cooking.

        :param string rev_id: the revision's sha1 identifier

        :resheader Content-Type: application/gzip

        **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`,
        :http:method:`options`

        :statuscode 200: no error
        :statuscode 400: an invalid revision identifier has been provided
        :statuscode 404: requested revision can not be found in the archive
    """
    # Validate the identifier; only the parsed hash value is needed here.
    _, obj_id = query.parse_hash_with_algorithms_or_throws(
        rev_id, ['sha1'], 'Only sha1_git is supported.')

    archive = api_lookup(
        service.vault_fetch, 'revision_gitfast', obj_id,
        notfound_msg="Revision with ID '{}' not found.".format(rev_id))

    # Serve the archive as a download named after the revision id.
    fname = '{}.gitfast.gz'.format(rev_id)
    response = HttpResponse(archive, content_type='application/gzip')
    response['Content-disposition'] = 'attachment; filename={}'.format(fname)
    return response
""" rev_gitfast_raw_url = reverse('api-1-vault-fetch-revision_gitfast', url_args={'rev_id': rev_id}) return redirect(rev_gitfast_raw_url) diff --git a/swh/web/browse/utils.py b/swh/web/browse/utils.py index 45e5369c..0962f604 100644 --- a/swh/web/browse/utils.py +++ b/swh/web/browse/utils.py @@ -1,1111 +1,1111 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import magic import pypandoc import stat import textwrap from collections import defaultdict from threading import Lock from django.core.cache import cache from django.utils.safestring import mark_safe from django.utils.html import escape from swh.model.identifiers import persistent_identifier from swh.web.common import highlightjs, service from swh.web.common.exc import NotFoundExc, http_status_code_message from swh.web.common.origin_visits import get_origin_visit from swh.web.common.utils import ( reverse, format_utc_iso_date, get_swh_persistent_id, swh_object_icons ) from swh.web.config import get_config def get_directory_entries(sha1_git): """Function that retrieves the content of a directory from the archive. The directories entries are first sorted in lexicographical order. Sub-directories and regular files are then extracted. 
def get_mimetype_and_encoding_for_content(content):
    """Function that returns the mime type and the encoding associated to
    a content buffer using the magic module under the hood.

    Two incompatible Python bindings are both importable as ``magic``; the
    one available is detected through the presence of ``from_buffer``.

    Args:
        content (bytes): a content buffer

    Returns:
        A tuple (mimetype, encoding), for instance ('text/plain', 'us-ascii'),
        associated to the provided content.
    """
    # https://pypi.org/project/python-magic/
    # packaged as python3-magic in debian buster
    if hasattr(magic, 'from_buffer'):
        m = magic.Magic(mime=True, mime_encoding=True)
        mime_encoding = m.from_buffer(content)
        # result looks like "<mime type>; charset=<encoding>"; partition
        # never fails to unpack, whatever the number of ';' in the result
        mime_type, _, encoding = mime_encoding.partition(';')
        encoding = encoding.replace(' charset=', '')
    # https://pypi.org/project/file-magic/
    # packaged as python3-magic in debian stretch
    else:
        # TODO: Remove that code when production environment is upgraded
        # to debian buster

        # calls to the file-magic API are not thread-safe so they must
        # be protected with a Lock to guarantee they will succeed;
        # the with-statement releases the lock even if detection raises
        with _lock:
            magic_result = magic.detect_from_content(content)
        mime_type = magic_result.mime_type
        encoding = magic_result.encoding
    return mime_type, encoding


def _re_encode_content(mimetype, encoding, content_data):
    """Re-encode a content to UTF-8 when it can be displayed as text.

    Args:
        mimetype (str): detected mime type of the content
        encoding (str): detected encoding of the content
        content_data (bytes): raw bytes of the content

    Returns:
        A tuple (mimetype, content_data). For ``text/*`` contents the bytes
        are re-encoded to UTF-8 unless the encoding is already ``utf-8`` or
        is ``binary``. For ``application/octet-stream`` contents that can be
        decoded with one of the candidate encodings, the bytes are re-encoded
        to UTF-8 and the mime type becomes ``text/plain``. Any other input is
        returned unchanged.
    """
    if mimetype.startswith('text/'):
        # probably a malformed UTF-8 content, re-encode it
        # by replacing invalid chars with a substitution one
        if encoding == 'unknown-8bit':
            content_data = content_data.decode('utf-8', 'replace')\
                .encode('utf-8')
        elif encoding not in ['utf-8', 'binary']:
            try:
                text = content_data.decode(encoding, 'replace')
            except LookupError:
                # the detected charset name is not a codec known to Python:
                # fall back to a lossy UTF-8 decoding instead of failing
                text = content_data.decode('utf-8', 'replace')
            content_data = text.encode('utf-8')
    elif mimetype.startswith('application/octet-stream'):
        # file may detect a text content as binary
        # so try to decode it for display
        candidates = ['us-ascii']
        candidates += ['iso-8859-%s' % i for i in range(1, 17)]
        for candidate in candidates:
            try:
                text = content_data.decode(candidate)
            except (UnicodeError, LookupError):
                # undecodable with this candidate, or no such codec
                continue
            content_data = text.encode('utf-8')
            # ensure display in content view
            mimetype = 'text/plain'
            break
    return mimetype, content_data
"""Function that retrieves a content from the archive. Raw bytes content is first retrieved, then the content mime type. If the mime type is not stored in the archive, it will be computed using Python magic module. Args: query_string: a string of the form "[ALGO_HASH:]HASH" where optional ALGO_HASH can be either ``sha1``, ``sha1_git``, ``sha256``, or ``blake2s256`` (default to ``sha1``) and HASH the hexadecimal representation of the hash value max_size: the maximum size for a content to retrieve (default to 1MB, no size limit if None) Returns: A tuple whose first member corresponds to the content raw bytes and second member the content mime type Raises: NotFoundExc if the content is not found """ content_data = service.lookup_content(query_string) filetype = None language = None license = None # requests to the indexer db may fail so properly handle # those cases in order to avoid content display errors try: filetype = service.lookup_content_filetype(query_string) language = service.lookup_content_language(query_string) license = service.lookup_content_license(query_string) except Exception: pass mimetype = 'unknown' encoding = 'unknown' if filetype: mimetype = filetype['mimetype'] encoding = filetype['encoding'] # workaround when encountering corrupted data due to implicit # conversion from bytea to text in the indexer db (see T818) # TODO: Remove that code when all data have been correctly converted if mimetype.startswith('\\'): filetype = None content_data['error_code'] = 200 content_data['error_message'] = '' content_data['error_description'] = '' if not max_size or content_data['length'] < max_size: try: content_raw = service.lookup_content_raw(query_string) except Exception as e: if raise_if_unavailable: raise e else: content_data['raw_data'] = None content_data['error_code'] = 404 content_data['error_description'] = \ 'The bytes of the content are currently not available in the archive.' 
def prepare_content_for_display(content_data, mime_type, path):
    """Prepare a content for HTML display.

    A highlightjs language is looked up from the content filename first and
    from its mime type second; ``'nohighlight'`` is used when neither lookup
    gives a result. Image contents whose mime type belongs to the set of
    browser-supported formats are base64-encoded, other images have their
    data dropped.

    Args:
        content_data (bytes): raw bytes of the content
        mime_type (string): mime type of the content
        path (string): path of the content including filename

    Returns:
        A dict with the keys 'content_data' (the provided bytes, a base64
        string for a supported image, or None for an unsupported image),
        'language' (the highlightjs language class) and 'mimetype' (the
        possibly adjusted mime type).
    """
    # the mime type lookup only runs when the filename lookup found nothing
    language = (highlightjs.get_hljs_language_from_filename(path)
                or highlightjs.get_hljs_language_from_mime_type(mime_type))

    if language:
        # a highlightable application/* content is displayed as text
        if mime_type.startswith('application/'):
            mime_type = mime_type.replace('application/', 'text/')
    else:
        language = 'nohighlight'

    if mime_type.startswith('image/'):
        if mime_type not in _browsers_supported_image_mimes:
            content_data = None
        else:
            content_data = base64.b64encode(content_data).decode('utf-8')

    # every image/svg* mime type is reported as image/svg+xml
    if mime_type.startswith('image/svg'):
        mime_type = 'image/svg+xml'

    display_data = {}
    display_data['content_data'] = content_data
    display_data['language'] = language
    display_data['mimetype'] = mime_type
    return display_data
releases[branch] = { 'name': release['name'], 'branch_name': branch, 'date': format_utc_iso_date(release['date']), 'id': release['id'], 'message': release['message'], 'target_type': release['target_type'], 'target': release['target'], } def _enrich_revision_branch(branch, revision): branches[branch].update({ 'revision': revision['id'], 'directory': revision['directory'], 'date': format_utc_iso_date(revision['date']), 'message': revision['message'] }) releases_info = service.lookup_release_multiple( release_to_branch.keys() ) for release in releases_info: branches_to_update = release_to_branch[release['id']] for branch in branches_to_update: _enrich_release_branch(branch, release) if release['target_type'] == 'revision': revision_to_release[release['target']].update( branches_to_update ) revisions = service.lookup_revision_multiple( set(revision_to_branch.keys()) | set(revision_to_release.keys()) ) for revision in revisions: if not revision: continue for branch in revision_to_branch[revision['id']]: _enrich_revision_branch(branch, revision) for release in revision_to_release[revision['id']]: releases[release]['directory'] = revision['directory'] for branch_alias, branch_target in branch_aliases.items(): if branch_target in branches: branches[branch_alias] = dict(branches[branch_target]) else: snp = service.lookup_snapshot(snapshot['id'], branches_from=branch_target, branches_count=1) if snp and branch_target in snp['branches']: if snp['branches'][branch_target] is None: continue target_type = snp['branches'][branch_target]['target_type'] target = snp['branches'][branch_target]['target'] if target_type == 'revision': branches[branch_alias] = snp['branches'][branch_target] revision = service.lookup_revision(target) _enrich_revision_branch(branch_alias, revision) elif target_type == 'release': release = service.lookup_release(target) _enrich_release_branch(branch_alias, release) if branch_alias in branches: branches[branch_alias]['name'] = branch_alias ret_branches = 
def get_snapshot_content(snapshot_id):
    """Return the branches and releases of a swh snapshot.

    The result is stored in cache under a key derived from the snapshot
    identifier in order to speed up navigation in the swh-web/browse ui.

    .. warning:: No more than ``snapshot_content_max_size`` branches are
        requested from the archive, for performance reasons.

    Args:
        snapshot_id (str): hexadecimal representation of the snapshot
            identifier

    Returns:
        A tuple with two members. The first one is a list of dict describing
        the snapshot branches. The second one is a list of dict describing
        the snapshot releases. Both are empty when no snapshot identifier
        is provided.

    Raises:
        NotFoundExc if the snapshot does not exist (presumably raised by
        the service lookup; not visible from this function)
    """
    cache_key = 'swh_snapshot_%s' % snapshot_id
    cached = cache.get(cache_key)
    if cached:
        return cached['branches'], cached['releases']

    if snapshot_id:
        snapshot = service.lookup_snapshot(
            snapshot_id, branches_count=snapshot_content_max_size)
        branches, releases = process_snapshot_branches(snapshot)
    else:
        branches, releases = [], []

    cache.set(cache_key, {'branches': branches, 'releases': releases})
    return branches, releases
def gen_link(url, link_text=None, link_attrs=None):
    """
    Utility function for generating an HTML link to insert
    in Django templates.

    Args:
        url (str): an url
        link_text (str): optional text for the produced link,
            if not provided the url will be used
        link_attrs (dict): optional attributes (e.g. class)
            to add to the link

    Returns:
        An HTML link in the form '<a href="url">link_text</a>',
        marked as safe for template rendering
    """
    # Attribute names and values are inserted verbatim (not escaped):
    # callers must only pass trusted, developer-provided attributes.
    attrs = ' '
    if link_attrs:
        attrs += ''.join('%s="%s" ' % (name, value)
                         for name, value in link_attrs.items())
    if not link_text:
        link_text = url
    # The template needs one placeholder per argument: attributes,
    # escaped url and escaped text.
    link = '<a%shref="%s">%s</a>' \
        % (attrs, escape(url), escape(link_text))
    return mark_safe(link)
def gen_revision_link(revision_id, shorten_id=False, snapshot_context=None,
                      link_text='Browse',
                      link_attrs={'class': 'btn btn-default btn-sm',
                                  'role': 'button'}):
    """
    Build an HTML link to the browse view of a revision, for insertion
    in Django templates.

    Args:
        revision_id (str): a revision id
        shorten_id (boolean): when true, the link text is the revision id
            truncated to its first 7 characters (link_text is then ignored)
        snapshot_context (dict): if provided, generate snapshot-dependent
            browsing link
        link_text (str): text for the generated link; when empty, the full
            revision id is used instead
        link_attrs (dict): optional attributes (e.g. class)
            to add to the link

    Returns:
        str: the HTML link produced by gen_link, or None when no
        revision id is given
    """
    if not revision_id:
        return None

    target_url = gen_revision_url(revision_id, snapshot_context)

    if shorten_id:
        label = revision_id[:7]
    else:
        label = link_text or revision_id

    return gen_link(target_url, label, link_attrs)
def gen_snapshot_link(snapshot_id, snapshot_context=None, link_text='Browse',
                      link_attrs={'class': 'btn btn-default btn-sm',
                                  'role': 'button'}):
    """
    Build an HTML link to the browse view of a snapshot, for insertion
    in Django templates.

    Args:
        snapshot_id (str): snapshot identifier
        snapshot_context (dict): if provided, its origin or snapshot
            information is forwarded as query parameters of the link
        link_text (str): text for the generated link; when empty, the
            snapshot id is used instead
        link_attrs (dict): optional attributes (e.g. class)
            to add to the link

    Returns:
        The HTML link produced by gen_link
    """
    snapshot_url = reverse(
        'browse-snapshot',
        url_args={'snapshot_id': snapshot_id},
        query_params=_snapshot_context_query_params(snapshot_context))
    label = link_text if link_text else snapshot_id
    return gen_link(snapshot_url, label, link_attrs)
def get_revision_log_url(revision_id, snapshot_context=None):
    """
    Compute the URL of the revision log HTML view, in the context of an
    origin or of a snapshot when one is provided.

    Args:
        revision_id (str): revision identifier the history heads to
        snapshot_context (dict): if provided, generate snapshot-dependent
            browsing link

    Returns:
        The revision log view URL
    """
    # no context at all: plain revision log view
    if not snapshot_context:
        return reverse('browse-revision-log',
                       url_args={'sha1_git': revision_id})

    query_params = {'revision': revision_id}
    origin_info = snapshot_context['origin_info']

    # context without origin: log view of the snapshot
    if not origin_info:
        return reverse(
            'browse-snapshot-log',
            url_args={'snapshot_id': snapshot_context['snapshot_id']},
            query_params=query_params)

    # origin context: forward the visit selection found in the context
    url_args = {'origin_url': origin_info['url']}
    context_url_args = snapshot_context['url_args']
    context_query_params = snapshot_context['query_params']
    if 'timestamp' in context_url_args:
        url_args['timestamp'] = context_url_args['timestamp']
    if 'visit_id' in context_query_params:
        query_params['visit_id'] = context_query_params['visit_id']
    return reverse('browse-origin-log',
                   url_args=url_args,
                   query_params=query_params)
def gen_release_link(sha1_git, snapshot_context=None, link_text='Browse',
                     link_attrs={'class': 'btn btn-default btn-sm',
                                 'role': 'button'}):
    """
    Build an HTML link to the browse view of a release, for insertion
    in Django templates.

    Args:
        sha1_git (str): release identifier
        snapshot_context (dict): if provided, its origin or snapshot
            information is forwarded as query parameters of the link
        link_text (str): text for the generated link; when empty, the
            release id is used instead
        link_attrs (dict): optional attributes (e.g. class)
            to add to the link

    Returns:
        The HTML link produced by gen_link
    """
    release_url = reverse(
        'browse-release',
        url_args={'sha1_git': sha1_git},
        query_params=_snapshot_context_query_params(snapshot_context))
    label = link_text if link_text else sha1_git
    return gen_link(release_url, label, link_attrs)
def get_origin_info(origin_url, origin_type=None):
    """
    Look up a software origin, guessing its type when none is given.

    Args:
        origin_url (str): complete url of a software origin
        origin_type (str): optional origin type

    Returns:
        The value returned by service.lookup_origin; according to the
        original documentation, a dict with the following entries:
            * type: the origin type
            * url: the origin url
            * id: the internal id of the origin

    Raises:
        NotFoundExc: if no origin type was given and the lookup failed
            for every type listed in _swh_origin_types
    """
    if origin_type:
        # type known: single lookup, its errors propagate to the caller
        return service.lookup_origin({'type': origin_type,
                                      'url': origin_url})

    # type unknown: probe each known type and return the first hit
    for candidate_type in _swh_origin_types:
        lookup_key = {'type': candidate_type, 'url': origin_url}
        try:
            return service.lookup_origin(lookup_key)
        except Exception:
            # best effort: a failed probe just means trying the next type
            continue

    raise NotFoundExc('Origin with url %s not found!' % escape(origin_url))
https://github.com/(user)/(repo)/) timestamp (str): a datetime string for retrieving the closest visit of the origin visit_id (int): optional visit id for disambiguation in case of several visits with the same timestamp Returns: A dict with the following entries: * origin_info: dict containing origin information * visit_info: dict containing visit information * branches: the list of branches for the origin found during the visit * releases: the list of releases for the origin found during the visit * origin_browse_url: the url to browse the origin * origin_branches_url: the url to browse the origin branches * origin_releases_url': the url to browse the origin releases * origin_visit_url: the url to browse the snapshot of the origin found during the visit * url_args: dict containing url arguments to use when browsing in the context of the origin and its visit Raises: NotFoundExc: if no snapshot is found for the visit of an origin. """ origin_info = None visit_info = None url_args = None query_params = {} branches = [] releases = [] browse_url = None visit_url = None branches_url = None releases_url = None swh_type = 'snapshot' if origin_url: swh_type = 'origin' origin_info = get_origin_info(origin_url, origin_type) visit_info = get_origin_visit(origin_info, timestamp, visit_id, snapshot_id) fmt_date = format_utc_iso_date(visit_info['date']) visit_info['fmt_date'] = fmt_date snapshot_id = visit_info['snapshot'] if not snapshot_id: raise NotFoundExc('No snapshot associated to the visit of origin ' '%s on %s' % (escape(origin_url), fmt_date)) # provided timestamp is not necessarily equals to the one # of the retrieved visit, so get the exact one in order # use it in the urls generated below if timestamp: timestamp = visit_info['date'] branches, releases = \ get_origin_visit_snapshot(origin_info, timestamp, visit_id, snapshot_id) url_args = {'origin_type': origin_type, 'origin_url': origin_info['url']} query_params = {'visit_id': visit_id} browse_url = 
def get_readme_to_display(readmes):
    """
    Process a list of readme files found in a directory
    in order to find the adequate one to display.

    Args:
        readmes: a dict where keys are readme file names and values
            are readme sha1s

    Returns:
        A tuple (readme_name, readme_url, readme_html) where readme_url
        is the url to the raw readme content and readme_html is only set
        for reStructuredText readmes (None otherwise). All members are
        None when no readme is available.
    """
    def _raw_content_url(sha1):
        # url serving the raw, re-encoded readme content
        return reverse('browse-content-raw',
                       url_args={'query_string': sha1},
                       query_params={'re_encode': 'true'})

    readme_name = None
    readme_url = None
    readme_sha1 = None
    readme_html = None

    # index readmes by lower-cased name to match them case-insensitively
    lc_readmes = {name.lower(): {'orig_name': name, 'sha1': sha1}
                  for name, sha1 in readmes.items()}

    # look for readme names according to the preference order
    # defined by the _common_readme_names list
    for common_readme_name in _common_readme_names:
        entry = lc_readmes.get(common_readme_name)
        if entry is not None:
            readme_name = entry['orig_name']
            readme_sha1 = entry['sha1']
            readme_url = _raw_content_url(readme_sha1)
            break

    # otherwise pick the first readme like file if any
    if not readme_name and readmes:
        readme_name = next(iter(readmes))
        readme_sha1 = readmes[readme_name]
        readme_url = _raw_content_url(readme_sha1)

    # convert rst README to html server side as there is
    # no viable solution to perform that task client side;
    # the extension check is case-insensitive, like the name
    # matching above, so that e.g. README.RST is converted too
    if readme_name and readme_name.lower().endswith('.rst'):
        cache_entry_id = 'readme_%s' % readme_sha1
        cache_entry = cache.get(cache_entry_id)

        if cache_entry:
            readme_html = cache_entry
        else:
            try:
                rst_doc = request_content(readme_sha1)
                readme_html = pypandoc.convert_text(rst_doc['raw_data'],
                                                    'html', format='rst')
                cache.set(cache_entry_id, readme_html)
            except Exception:
                # best effort: display a placeholder instead of failing
                # the whole directory view
                readme_html = 'Readme bytes are not available'

    return readme_name, readme_url, readme_html
Args: swh_objects (list): a list of dict with the following keys: * type: swh object type - (content/directory/release/revision/snapshot) + (content/directory/release/revision/snapshot) * id: swh object id snapshot_context (dict): optional parameter describing the snapshot in which the object has been found Returns: list: a list of dict with the following keys: * object_type: the swh object type - (content/directory/release/revision/snapshot) + (content/directory/release/revision/snapshot) * object_icon: the swh object icon to use in HTML views * swh_id: the computed swh object persistent identifier * swh_id_url: the url resolving the persistent identifier * show_options: boolean indicating if the persistent id options must be displayed in persistent ids HTML view """ swh_ids = [] for swh_object in swh_objects: if not swh_object['id']: continue swh_id = get_swh_persistent_id(swh_object['type'], swh_object['id']) show_options = swh_object['type'] == 'content' or \ (snapshot_context and snapshot_context['origin_info'] is not None) object_icon = swh_object_icons[swh_object['type']] swh_ids.append({ 'object_type': swh_object['type'], 'object_icon': object_icon, 'swh_id': swh_id, 'swh_id_url': reverse('browse-swh-id', url_args={'swh_id': swh_id}), 'show_options': show_options }) return swh_ids diff --git a/swh/web/browse/views/content.py b/swh/web/browse/views/content.py index b8af32b8..f90550b0 100644 --- a/swh/web/browse/views/content.py +++ b/swh/web/browse/views/content.py @@ -1,331 +1,331 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import difflib import json from distutils.util import strtobool from django.http import HttpResponse from django.shortcuts import render from django.template.defaultfilters import filesizeformat from swh.model.hashutil import 
@browse_route(r'content/(?P<query_string>[0-9a-z_:]*[0-9a-f]+.)/raw/',
              view_name='browse-content-raw',
              checksum_args=['query_string'])
def content_raw(request, query_string):
    """Django view that produces a raw display of a content identified
    by its hash value.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/raw/`

    Args:
        request: input django http request; the optional ``re_encode``
            and ``filename`` query parameters are read from it
        query_string: content hash, optionally prefixed by the hash
            algorithm, captured from the url by the named group of the
            route pattern

    Returns:
        An HttpResponse with the raw content: served as ``text/plain``
        when the content is textual or empty, as an
        ``application/octet-stream`` attachment otherwise. When fetching
        the content fails, the response built by handle_view_exception
        is returned instead.
    """
    try:
        re_encode = bool(strtobool(request.GET.get('re_encode', 'false')))
        algo, checksum = query.parse_hash(query_string)
        checksum = hash_to_hex(checksum)
        content_data = request_content(query_string, max_size=None,
                                       re_encode=re_encode)
    except Exception as exc:
        return handle_view_exception(request, exc)

    # default file name: '<algo>_<hex checksum>'
    filename = request.GET.get('filename', None)
    if not filename:
        filename = '%s_%s' % (algo, checksum)

    mimetype = content_data['mimetype']
    if mimetype.startswith('text/') or mimetype == 'inode/x-empty':
        # textual or empty content: no 'attachment' disposition is set
        response = HttpResponse(content_data['raw_data'],
                                content_type="text/plain")
        response['Content-disposition'] = 'filename=%s' % filename
    else:
        # anything else is sent as an attachment
        response = HttpResponse(content_data['raw_data'],
                                content_type='application/octet-stream')
        response['Content-disposition'] = \
            'attachment; filename=%s' % filename
    return response
By default, diffs whose size are greater than 20 kB will not be generated. To force the generation of large diffs, the 'force' boolean query parameter must be used. Args: request: input django http request from_query_string: a string of the form "[ALGO_HASH:]HASH" where optional ALGO_HASH can be either ``sha1``, ``sha1_git``, ``sha256``, or ``blake2s256`` (default to ``sha1``) and HASH the hexadecimal representation of the hash value identifying the first content to_query_string: same as above for identifying the second content Returns: A JSON object containing the unified diff. """ diff_data = {} content_from = None content_to = None content_from_size = 0 content_to_size = 0 content_from_lines = [] content_to_lines = [] force = request.GET.get('force', 'false') path = request.GET.get('path', None) language = 'nohighlight' force = bool(strtobool(force)) if from_query_string == to_query_string: diff_str = 'File renamed without changes' else: try: text_diff = True if from_query_string: content_from = \ request_content(from_query_string, max_size=None) content_from_display_data = \ prepare_content_for_display(content_from['raw_data'], content_from['mimetype'], path) language = content_from_display_data['language'] content_from_size = content_from['length'] if not (content_from['mimetype'].startswith('text/') or content_from['mimetype'] == 'inode/x-empty'): text_diff = False if text_diff and to_query_string: content_to = request_content(to_query_string, max_size=None) content_to_display_data = prepare_content_for_display( content_to['raw_data'], content_to['mimetype'], path) language = content_to_display_data['language'] content_to_size = content_to['length'] if not (content_to['mimetype'].startswith('text/') or content_to['mimetype'] == 'inode/x-empty'): text_diff = False diff_size = abs(content_to_size - content_from_size) if not text_diff: diff_str = 'Diffs are not generated for non textual content' language = 'nohighlight' elif not force and diff_size > 
_auto_diff_size_limit: diff_str = 'Large diffs are not automatically computed' language = 'nohighlight' else: if content_from: content_from_lines = \ content_from['raw_data'].decode('utf-8')\ .splitlines(True) if content_from_lines and \ content_from_lines[-1][-1] != '\n': content_from_lines[-1] += '[swh-no-nl-marker]\n' if content_to: content_to_lines = content_to['raw_data'].decode('utf-8')\ .splitlines(True) if content_to_lines and content_to_lines[-1][-1] != '\n': content_to_lines[-1] += '[swh-no-nl-marker]\n' diff_lines = difflib.unified_diff(content_from_lines, content_to_lines) diff_str = ''.join(list(diff_lines)[2:]) except Exception as e: diff_str = str(e) diff_data['diff_str'] = diff_str diff_data['language'] = language diff_data_json = json.dumps(diff_data, separators=(',', ': ')) return HttpResponse(diff_data_json, content_type='application/json') @browse_route(r'content/(?P[0-9a-z_:]*[0-9a-f]+.)/', view_name='browse-content', checksum_args=['query_string']) def content_display(request, query_string): """Django view that produces an HTML display of a content identified by its hash value. 
The url that points to it is - :http:get:`/browse/content/[(algo_hash):](hash)/` + :http:get:`/browse/content/[(algo_hash):](hash)/` """ try: algo, checksum = query.parse_hash(query_string) checksum = hash_to_hex(checksum) content_data = request_content(query_string, raise_if_unavailable=False) origin_type = request.GET.get('origin_type', None) origin_url = request.GET.get('origin_url', None) selected_language = request.GET.get('language', None) if not origin_url: origin_url = request.GET.get('origin', None) snapshot_context = None if origin_url: try: snapshot_context = get_snapshot_context(None, origin_type, origin_url) except Exception: raw_cnt_url = reverse('browse-content', url_args={'query_string': query_string}) error_message = \ ('The Software Heritage archive has a content ' 'with the hash you provided but the origin ' 'mentioned in your request appears broken: %s. ' 'Please check the URL and try again.\n\n' 'Nevertheless, you can still browse the content ' 'without origin information: %s' % (gen_link(origin_url), gen_link(raw_cnt_url))) raise NotFoundExc(error_message) if snapshot_context: snapshot_context['visit_info'] = None except Exception as exc: return handle_view_exception(request, exc) path = request.GET.get('path', None) content = None language = None mimetype = None if content_data['raw_data'] is not None: content_display_data = prepare_content_for_display( content_data['raw_data'], content_data['mimetype'], path) content = content_display_data['content_data'] language = content_display_data['language'] mimetype = content_display_data['mimetype'] # Override language with user-selected language if selected_language is not None: language = selected_language available_languages = None if mimetype and 'text/' in mimetype: available_languages = highlightjs.get_supported_languages() root_dir = None filename = None path_info = None directory_id = None directory_url = None query_params = {'origin': origin_url} breadcrumbs = [] if path: split_path = 
path.split('/') root_dir = split_path[0] filename = split_path[-1] if root_dir != path: path = path.replace(root_dir + '/', '') path = path[:-len(filename)] path_info = gen_path_info(path) dir_url = reverse('browse-directory', url_args={'sha1_git': root_dir}, query_params=query_params) breadcrumbs.append({'name': root_dir[:7], 'url': dir_url}) for pi in path_info: dir_url = reverse('browse-directory', url_args={'sha1_git': root_dir, 'path': pi['path']}, query_params=query_params) breadcrumbs.append({'name': pi['name'], 'url': dir_url}) breadcrumbs.append({'name': filename, 'url': None}) if path and root_dir != path: dir_info = service.lookup_directory_with_path(root_dir, path) directory_id = dir_info['target'] elif root_dir != path: directory_id = root_dir if directory_id: directory_url = gen_directory_link(directory_id) query_params = {'filename': filename} content_raw_url = reverse('browse-content-raw', url_args={'query_string': query_string}, query_params=query_params) content_metadata = { 'sha1': content_data['checksums']['sha1'], 'sha1_git': content_data['checksums']['sha1_git'], 'sha256': content_data['checksums']['sha256'], 'blake2s256': content_data['checksums']['blake2s256'], 'mimetype': content_data['mimetype'], 'encoding': content_data['encoding'], 'size': filesizeformat(content_data['length']), 'language': content_data['language'], 'licenses': content_data['licenses'], 'filename': filename, 'directory': directory_id, 'context-independent directory': directory_url } if filename: content_metadata['filename'] = filename sha1_git = content_data['checksums']['sha1_git'] swh_ids = get_swh_persistent_ids([{'type': 'content', 'id': sha1_git}]) heading = 'Content - %s' % sha1_git if breadcrumbs: content_path = '/'.join([bc['name'] for bc in breadcrumbs]) heading += ' - %s' % content_path return render(request, 'browse/content.html', {'heading': heading, 'swh_object_id': swh_ids[0]['swh_id'], 'swh_object_name': 'Content', 'swh_object_metadata': content_metadata, 
'content': content, 'content_size': content_data['length'], 'max_content_size': content_display_max_size, 'mimetype': mimetype, 'language': language, 'available_languages': available_languages, 'breadcrumbs': breadcrumbs, 'top_right_link': { 'url': content_raw_url, 'icon': swh_object_icons['content'], 'text': 'Raw File' }, 'snapshot_context': snapshot_context, 'vault_cooking': None, 'show_actions_menu': True, 'swh_ids': swh_ids, 'error_code': content_data['error_code'], 'error_message': content_data['error_message'], 'error_description': content_data['error_description']}, status=content_data['error_code']) diff --git a/swh/web/browse/views/directory.py b/swh/web/browse/views/directory.py index 7561ba39..8bc37d2a 100644 --- a/swh/web/browse/views/directory.py +++ b/swh/web/browse/views/directory.py @@ -1,178 +1,178 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import os from django.http import HttpResponse from django.shortcuts import render, redirect from django.template.defaultfilters import filesizeformat from swh.web.common import service from swh.web.common.utils import ( reverse, gen_path_info ) from swh.web.common.exc import handle_view_exception, NotFoundExc from swh.web.browse.utils import ( get_directory_entries, get_snapshot_context, get_readme_to_display, get_swh_persistent_ids, gen_link ) from swh.web.browse.browseurls import browse_route @browse_route(r'directory/(?P[0-9a-f]+)/', r'directory/(?P[0-9a-f]+)/(?P.+)/', view_name='browse-directory', checksum_args=['sha1_git']) def directory_browse(request, sha1_git, path=None): """Django view for browsing the content of a directory identified by its sha1_git value. 
The url that points to it is - :http:get:`/browse/directory/(sha1_git)/[(path)/]` + :http:get:`/browse/directory/(sha1_git)/[(path)/]` """ root_sha1_git = sha1_git try: if path: dir_info = service.lookup_directory_with_path(sha1_git, path) sha1_git = dir_info['target'] dirs, files = get_directory_entries(sha1_git) origin_type = request.GET.get('origin_type', None) origin_url = request.GET.get('origin_url', None) if not origin_url: origin_url = request.GET.get('origin', None) snapshot_context = None if origin_url: try: snapshot_context = get_snapshot_context(None, origin_type, origin_url) except Exception: raw_dir_url = reverse('browse-directory', url_args={'sha1_git': sha1_git}) error_message = \ ('The Software Heritage archive has a directory ' 'with the hash you provided but the origin ' 'mentioned in your request appears broken: %s. ' 'Please check the URL and try again.\n\n' 'Nevertheless, you can still browse the directory ' 'without origin information: %s' % (gen_link(origin_url), gen_link(raw_dir_url))) raise NotFoundExc(error_message) if snapshot_context: snapshot_context['visit_info'] = None except Exception as exc: return handle_view_exception(request, exc) path_info = gen_path_info(path) query_params = {'origin': origin_url} breadcrumbs = [] breadcrumbs.append({'name': root_sha1_git[:7], 'url': reverse('browse-directory', url_args={'sha1_git': root_sha1_git}, query_params=query_params)}) for pi in path_info: breadcrumbs.append({'name': pi['name'], 'url': reverse('browse-directory', url_args={'sha1_git': root_sha1_git, 'path': pi['path']}, query_params=query_params)}) path = '' if path is None else (path + '/') for d in dirs: if d['type'] == 'rev': d['url'] = reverse('browse-revision', url_args={'sha1_git': d['target']}, query_params=query_params) else: d['url'] = reverse('browse-directory', url_args={'sha1_git': root_sha1_git, 'path': path + d['name']}, query_params=query_params) sum_file_sizes = 0 readmes = {} for f in files: query_string = 'sha1_git:' 
@browse_route(r'directory/resolve/content-path/(?P<sha1_git>[0-9a-f]+)/(?P<path>.+)/',  # noqa
              view_name='browse-directory-resolve-content-path',
              checksum_args=['sha1_git'])
def _directory_resolve_content_path(request, sha1_git, path):
    """
    Internal endpoint redirecting to data url for a specific file path
    relative to a root directory.

    Args:
        request: the incoming HTTP request
        sha1_git (str): identifier of the root directory
        path (str): path of the file, relative to the root directory

    Returns:
        A redirect to the ``browse-content-raw`` url of the file when the
        path resolves to a file, an empty 404 response otherwise.
    """
    try:
        path = os.path.normpath(path)
        # normpath() collapses e.g. "a/../.." to "..", which the '../'
        # prefix test alone does not catch
        if path != '..' and not path.startswith('../'):
            dir_info = service.lookup_directory_with_path(sha1_git, path)
            if dir_info['type'] == 'file':
                sha1 = dir_info['checksums']['sha1']
                data_url = reverse('browse-content-raw',
                                   url_args={'query_string': sha1})
                return redirect(data_url)
    except Exception:
        # best effort: any failure falls through to the 404 below
        pass
    return HttpResponse(status=404)


# diff --git a/swh/web/browse/views/origin.py b/swh/web/browse/views/origin.py
# index 2e5f926e..9dec75ab 100644
# --- a/swh/web/browse/views/origin.py
# +++ b/swh/web/browse/views/origin.py
# @@ -1,241 +1,241 @@

# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json

from distutils.util import strtobool

from django.http import HttpResponse
from django.shortcuts import render, redirect

from swh.web.common import service
from swh.web.common.origin_visits import get_origin_visits
from swh.web.common.utils import (
    reverse, format_utc_iso_date, parse_timestamp
)
from swh.web.common.exc import handle_view_exception
from swh.web.browse.utils import (
    get_origin_info, get_snapshot_context
)
from swh.web.browse.browseurls import browse_route

from .utils.snapshot_context import (
    browse_snapshot_directory, browse_snapshot_content,
    browse_snapshot_log, browse_snapshot_branches,
    browse_snapshot_releases
)


@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/visit/(?P<timestamp>.+)/directory/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/visit/(?P<timestamp>.+)/directory/(?P<path>.+)/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/directory/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/directory/(?P<path>.+)/',
              r'origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/directory/',
              r'origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)'
              '/directory/(?P<path>.+)/',
              r'origin/(?P<origin_url>.+)/directory/',
              r'origin/(?P<origin_url>.+)/directory/(?P<path>.+)/',
              view_name='browse-origin-directory')
def origin_directory_browse(request, origin_url, origin_type=None,
                            timestamp=None, path=None):
    """Django view for browsing the content of a directory associated
    to an origin for a given visit.

    The url scheme that points to it is the following:

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/directory/[(path)/]`

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/visit/(timestamp)/directory/[(path)/]`

    All arguments are forwarded unchanged to ``browse_snapshot_directory``.
    """ # noqa
    return browse_snapshot_directory(
        request, origin_type=origin_type, origin_url=origin_url,
        timestamp=timestamp, path=path)


@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/visit/(?P<timestamp>.+)/content/(?P<path>.+)/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/content/(?P<path>.+)/',
              r'origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)'
              '/content/(?P<path>.+)/',
              r'origin/(?P<origin_url>.+)/content/(?P<path>.+)/',
              view_name='browse-origin-content')
def origin_content_browse(request, origin_url, origin_type=None, path=None,
                          timestamp=None):
    """Django view that produces an HTML display of a content
    associated to an origin for a given visit.

    The url scheme that points to it is the following:

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/content/(path)/`

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/visit/(timestamp)/content/(path)/`

    The optional ``language`` query parameter is forwarded to
    ``browse_snapshot_content`` as ``selected_language``.
    """ # noqa
    language = request.GET.get('language', None)
    return browse_snapshot_content(request,
                                   origin_type=origin_type,
                                   origin_url=origin_url,
                                   timestamp=timestamp,
                                   path=path,
                                   selected_language=language)


PER_PAGE = 20


@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/visit/(?P<timestamp>.+)/log/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)/log/',
              r'origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/log/',
              r'origin/(?P<origin_url>.+)/log/',
              view_name='browse-origin-log')
def origin_log_browse(request, origin_url, origin_type=None, timestamp=None):
    """Django view that produces an HTML display of revisions history (aka
    the commit log) associated to a software origin.

    The url scheme that points to it is the following:

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/log/`

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/visit/(timestamp)/log/`

    All arguments are forwarded unchanged to ``browse_snapshot_log``.
    """ # noqa
    return browse_snapshot_log(request,
                               origin_type=origin_type,
                               origin_url=origin_url,
                               timestamp=timestamp)
@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/visit/(?P<timestamp>.+)/branches/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/branches/',
              r'origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/branches/',
              r'origin/(?P<origin_url>.+)/branches/',
              view_name='browse-origin-branches')
def origin_branches_browse(request, origin_url, origin_type=None,
                           timestamp=None):
    """Django view that produces an HTML display of the list of branches
    associated to an origin for a given visit.

    The url scheme that points to it is the following:

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/branches/`

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/visit/(timestamp)/branches/`

    All arguments are forwarded unchanged to ``browse_snapshot_branches``.
    """ # noqa
    return browse_snapshot_branches(request,
                                    origin_type=origin_type,
                                    origin_url=origin_url,
                                    timestamp=timestamp)


@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/visit/(?P<timestamp>.+)/releases/',
              r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)'
              '/releases/',
              r'origin/(?P<origin_url>.+)/visit/(?P<timestamp>.+)/releases/',
              r'origin/(?P<origin_url>.+)/releases/',
              view_name='browse-origin-releases')
def origin_releases_browse(request, origin_url, origin_type=None,
                           timestamp=None):
    """Django view that produces an HTML display of the list of releases
    associated to an origin for a given visit.

    The url scheme that points to it is the following:

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/releases/`

        * :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/visit/(timestamp)/releases/`

    All arguments are forwarded unchanged to ``browse_snapshot_releases``.
    """ # noqa
    return browse_snapshot_releases(request,
                                    origin_type=origin_type,
                                    origin_url=origin_url,
                                    timestamp=timestamp)
@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)/visits/',
              r'origin/(?P<origin_url>.+)/visits/',
              view_name='browse-origin-visits')
def origin_visits_browse(request, origin_url, origin_type=None):
    """Django view that produces an HTML display of visits reporting
    for a swh origin identified by its id or its url.

    The url that points to it is
    :http:get:`/browse/origin/[(origin_type)/url/](origin_url)/visits/`.

    Each visit dict is enriched in place with ``fmt_date`` and
    ``browse_url``; its ``date`` is replaced by a POSIX timestamp and a
    falsy ``snapshot`` by an empty string.
    """
    try:
        origin_info = get_origin_info(origin_url, origin_type)
        origin_visits = get_origin_visits(origin_info)
        snapshot_context = get_snapshot_context(origin_type=origin_type,
                                                origin_url=origin_url)
    except Exception as exc:
        return handle_view_exception(request, exc)

    # Capture the dates before the loop: visit['date'] is overwritten with
    # a float below, so comparing against the previous visit's (already
    # converted) date inside the loop could never match.
    visit_dates = [visit['date'] for visit in origin_visits]
    last_index = len(origin_visits) - 1

    for i, visit in enumerate(origin_visits):
        date = visit_dates[i]
        url_date = format_utc_iso_date(date, '%Y-%m-%dT%H:%M:%SZ')
        visit['fmt_date'] = format_utc_iso_date(date)

        # visits sharing a date with a neighbour need an explicit visit id
        # to be told apart in the browse url
        same_as_next = i < last_index and date == visit_dates[i + 1]
        same_as_previous = i > 0 and date == visit_dates[i - 1]
        query_params = {}
        if same_as_next or same_as_previous:
            query_params = {'visit_id': visit['visit']}

        visit['browse_url'] = reverse('browse-origin-directory',
                                      url_args={'origin_type': origin_type,
                                                'origin_url': origin_url,
                                                'timestamp': url_date},
                                      query_params=query_params)
        if not visit['snapshot']:
            visit['snapshot'] = ''
        visit['date'] = parse_timestamp(date).timestamp()

    heading = 'Origin visits - %s' % origin_url

    return render(request, 'browse/origin-visits.html',
                  {'heading': heading,
                   'swh_object_name': 'Visits',
                   'swh_object_metadata': origin_info,
                   'origin_visits': origin_visits,
                   'origin_info': origin_info,
                   'snapshot_context': snapshot_context,
                   'vault_cooking': None,
                   'show_actions_menu': False})
@browse_route(r'origin/search/(?P<url_pattern>.+)/',
              view_name='browse-origin-search')
def _origin_search(request, url_pattern):
    """Internal browse endpoint to search for origins whose urls contain
    a provided string pattern or match a provided regular expression.
    The search is performed in a case insensitive way.

    Query parameters read from the request: ``offset`` (default 0),
    ``limit`` (default 50), ``regexp`` and ``with_visit`` (both default
    to false).

    Returns:
        A JSON response holding the list of matching origins, or the
        response built by ``handle_view_exception`` on failure.
    """
    try:
        # parsed inside the try block so that malformed values are reported
        # through handle_view_exception like any other failure
        offset = int(request.GET.get('offset', '0'))
        limit = int(request.GET.get('limit', '50'))
        regexp = request.GET.get('regexp', 'false')
        with_visit = request.GET.get('with_visit', 'false')

        url_pattern = url_pattern.replace('///', '\\')

        results = service.search_origin(url_pattern, offset, limit,
                                        bool(strtobool(regexp)),
                                        bool(strtobool(with_visit)))

        results = json.dumps(list(results), sort_keys=True, indent=4,
                             separators=(',', ': '))
    except Exception as exc:
        return handle_view_exception(request, exc, html_response=False)

    return HttpResponse(results, content_type='application/json')


@browse_route(r'origin/(?P<origin_type>[a-z]+)/url/(?P<origin_url>.+)/',
              r'origin/(?P<origin_url>.+)/',
              view_name='browse-origin')
def origin_browse(request, origin_url, origin_type=None):
    """Django view that redirects to the display of the latest archived
    snapshot for a given software origin.
    """
    last_snapshot_url = reverse('browse-origin-directory',
                                url_args={'origin_type': origin_type,
                                          'origin_url': origin_url})
    return redirect(last_snapshot_url)


# diff --git a/swh/web/browse/views/snapshot.py b/swh/web/browse/views/snapshot.py  # noqa
# index e40a5096..f42fe399 100644
# --- a/swh/web/browse/views/snapshot.py
# +++ b/swh/web/browse/views/snapshot.py
# @@ -1,104 +1,104 @@

# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

from django.shortcuts import redirect

from swh.web.browse.browseurls import browse_route
from swh.web.common.utils import reverse

from .utils.snapshot_context import (
    browse_snapshot_directory, browse_snapshot_content,
    browse_snapshot_log, browse_snapshot_branches,
    browse_snapshot_releases
)


@browse_route(r'snapshot/(?P<snapshot_id>[0-9a-f]+)/',
              view_name='browse-snapshot',
              checksum_args=['snapshot_id'])
def snapshot_browse(request, snapshot_id):
    """Django view for browsing the content of a snapshot.

    The url that points to it is :http:get:`/browse/snapshot/(snapshot_id)/`

    It redirects to the ``browse-snapshot-directory`` view, carrying over
    the query parameters of the request.
    """
    browse_snapshot_url = reverse('browse-snapshot-directory',
                                  url_args={'snapshot_id': snapshot_id},
                                  query_params=request.GET)
    return redirect(browse_snapshot_url)


@browse_route(r'snapshot/(?P<snapshot_id>[0-9a-f]+)/directory/',
              r'snapshot/(?P<snapshot_id>[0-9a-f]+)/directory/(?P<path>.+)/',
              view_name='browse-snapshot-directory',
              checksum_args=['snapshot_id'])
def snapshot_directory_browse(request, snapshot_id, path=None):
    """Django view for browsing the content of a directory collected
    in a snapshot.

    The url that points to it is
    :http:get:`/browse/snapshot/(snapshot_id)/directory/[(path)/]`

    The ``origin_type`` and ``origin_url`` (or ``origin``) query parameters
    are forwarded to ``browse_snapshot_directory``.
    """
    origin_type = request.GET.get('origin_type', None)
    origin_url = request.GET.get('origin_url', None)
    if not origin_url:
        origin_url = request.GET.get('origin', None)
    return browse_snapshot_directory(request, snapshot_id=snapshot_id,
                                     path=path, origin_type=origin_type,
                                     origin_url=origin_url)
@browse_route(r'snapshot/(?P<snapshot_id>[0-9a-f]+)/content/(?P<path>.+)/',
              view_name='browse-snapshot-content',
              checksum_args=['snapshot_id'])
def snapshot_content_browse(request, snapshot_id, path):
    """Django view that produces an HTML display of a content
    collected in a snapshot.

    The url that points to it is
    :http:get:`/browse/snapshot/(snapshot_id)/content/(path)/`

    The optional ``language`` query parameter is forwarded to
    ``browse_snapshot_content`` as ``selected_language``.
    """
    language = request.GET.get('language', None)
    return browse_snapshot_content(request, snapshot_id=snapshot_id,
                                   path=path, selected_language=language)


@browse_route(r'snapshot/(?P<snapshot_id>[0-9a-f]+)/log/',
              view_name='browse-snapshot-log',
              checksum_args=['snapshot_id'])
def snapshot_log_browse(request, snapshot_id):
    """Django view that produces an HTML display of revisions history (aka
    the commit log) collected in a snapshot.

    The url that points to it is
    :http:get:`/browse/snapshot/(snapshot_id)/log/`
    """
    return browse_snapshot_log(request, snapshot_id=snapshot_id)


@browse_route(r'snapshot/(?P<snapshot_id>[0-9a-f]+)/branches/',
              view_name='browse-snapshot-branches',
              checksum_args=['snapshot_id'])
def snapshot_branches_browse(request, snapshot_id):
    """Django view that produces an HTML display of the list of branches
    collected in a snapshot.

    The url that points to it is
    :http:get:`/browse/snapshot/(snapshot_id)/branches/`
    """
    return browse_snapshot_branches(request, snapshot_id=snapshot_id)
@browse_route(r'snapshot/(?P<snapshot_id>[0-9a-f]+)/releases/',
              view_name='browse-snapshot-releases',
              checksum_args=['snapshot_id'])
def snapshot_releases_browse(request, snapshot_id):
    """Django view that produces an HTML display of the list of releases
    collected in a snapshot.

    The url that points to it is
    :http:get:`/browse/snapshot/(snapshot_id)/releases/`
    """
    return browse_snapshot_releases(request, snapshot_id=snapshot_id)


# diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
# index f0cbf86e..b6094e37 100644
# --- a/swh/web/common/utils.py
# +++ b/swh/web/common/utils.py
# @@ -1,338 +1,338 @@

# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

import docutils.parsers.rst
import docutils.utils
import re

from datetime import datetime, timezone
from dateutil import parser as date_parser
from dateutil import tz

from django.urls import reverse as django_reverse
from django.http import QueryDict

from rest_framework.authentication import SessionAuthentication

from swh.model.exceptions import ValidationError
from swh.model.identifiers import (
    persistent_identifier, parse_persistent_identifier,
    CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT
)
from swh.web.common.exc import BadInputExc

# CSS icon classes, indexed by the kind of object they illustrate
swh_object_icons = {
    'branch': 'fa fa-code-fork',
    'branches': 'fa fa-code-fork',
    'content': 'fa fa-file-text',
    'directory': 'fa fa-folder',
    'person': 'fa fa-user',
    'revisions history': 'fa fa-history',
    'release': 'fa fa-tag',
    'releases': 'fa fa-tag',
    'revision': 'octicon-git-commit',
    'snapshot': 'fa fa-camera',
    'visits': 'fa fa-calendar',
}
def reverse(viewname, url_args=None, query_params=None,
            current_app=None, urlconf=None):
    """An override of django reverse function supporting query parameters.

    Args:
        viewname (str): the name of the django view from which to compute
            a url
        url_args (dict): dictionary of url arguments indexed by their names;
            entries whose value is None are ignored
        query_params (dict): dictionary of query parameters to append to the
            reversed url; entries with a falsy value are ignored
        current_app (str): the name of the django app tighten to the view
        urlconf (str): url configuration module

    Returns:
        str: the url of the requested view with processed arguments and
        query parameters
    """
    if url_args:
        url_args = {k: v for k, v in url_args.items() if v is not None}

    url = django_reverse(viewname, urlconf=urlconf, kwargs=url_args,
                         current_app=current_app)

    if query_params:
        query_params = {k: v for k, v in query_params.items() if v}
        if query_params:
            # parameters are emitted in sorted order to get stable urls
            query_dict = QueryDict('', mutable=True)
            for k in sorted(query_params):
                query_dict[k] = query_params[k]
            url += '?' + query_dict.urlencode(safe='/;:')

    return url


def datetime_to_utc(date):
    """Converts a timezone-aware datetime to UTC.

    Args:
        date (datetime.datetime): input datetime

    Returns:
        datetime.datetime: the same instant expressed in UTC (with
        ``timezone.utc`` as tzinfo) when the input is timezone-aware;
        a naive input is returned unchanged.
    """
    if date.tzinfo is None:
        return date
    return date.astimezone(timezone.utc)


def parse_timestamp(timestamp):
    """Given a time or timestamp (as string), parse the result as UTC datetime.

    Returns:
        datetime.datetime: a timezone-aware datetime representing the
        parsed value or None if the provided value is empty.

    Raises:
        BadInputExc: if the value is neither a parsable date nor a
            POSIX timestamp

    Samples:
        - 2016-01-12
        - 2016-01-12T09:19:12+0100
        - Today is January 1, 2047 at 8:21:00AM
        - 1452591542

    """
    if not timestamp:
        return None

    try:
        date = date_parser.parse(timestamp, ignoretz=False, fuzzy=True)
        return datetime_to_utc(date)
    except Exception:
        # not a date string: retry interpreting the value as a POSIX
        # timestamp (seconds since the epoch, UTC)
        try:
            return datetime.fromtimestamp(float(timestamp), tz=timezone.utc)
        except (ValueError, OverflowError, OSError) as e:
            raise BadInputExc(e) from e
def shorten_path(path):
    """Shorten the given path: for each hash present, only return the first
    8 characters followed by an ellipsis.

    Args:
        path (str): a path possibly containing hexadecimal sha256 (64
            characters) or sha1 (40 characters) hashes

    Returns:
        str: the path with every such hash truncated
    """
    # 8 kept characters + 56 (resp. 32) dropped ones = 64 (resp. 40)
    sha256_re = r'([0-9a-f]{8})[0-9a-f]{56}'
    sha1_re = r'([0-9a-f]{8})[0-9a-f]{32}'

    # longest hashes first, otherwise a sha256 would match the sha1 pattern
    ret = re.sub(sha256_re, r'\1...', path)
    return re.sub(sha1_re, r'\1...', ret)


def format_utc_iso_date(iso_date, fmt='%d %B %Y, %H:%M UTC'):
    """Turns a string representation of an ISO 8601 date string
    to UTC and format it into a more human readable one.

    For instance, from the following input
    string: '2017-05-04T13:27:13+02:00' the following one
    is returned: '04 May 2017, 11:27 UTC'.
    Custom format string may also be provided
    as parameter

    Args:
        iso_date (str): a string representation of an ISO 8601 date
        fmt (str): optional date formatting string

    Returns:
        str: a formatted string representation of the input iso date;
        a falsy input is returned unchanged
    """
    if not iso_date:
        return iso_date
    date = parse_timestamp(iso_date)
    return date.strftime(fmt)


def gen_path_info(path):
    """Function to generate path data navigation for use
    with a breadcrumb in the swh web ui.

    For instance, from a path /folder1/folder2/folder3,
    it returns the following list::

        [{'name': 'folder1', 'path': 'folder1'},
         {'name': 'folder2', 'path': 'folder1/folder2'},
         {'name': 'folder3', 'path': 'folder1/folder2/folder3'}]

    Args:
        path: a filesystem path

    Returns:
        list: a list of path data for navigation as illustrated above;
        empty when the path is empty or None.

    """
    path_info = []
    if path:
        path_from_root = ''
        for name in path.strip('/').split('/'):
            if not name:
                # lone or duplicated slashes would otherwise produce
                # breadcrumb entries without a name
                continue
            path_from_root += '/' + name
            path_info.append({'name': name,
                              'path': path_from_root.strip('/')})
    return path_info


def get_swh_persistent_id(object_type, object_id, scheme_version=1):
    """
    Returns the persistent identifier for a swh object based on:

        * the object type
        * the object id
        * the swh identifiers scheme version

    Args:
        object_type (str): the swh object type
            (content/directory/release/revision/snapshot)
        object_id (str): the swh object id (hexadecimal representation
            of its hash value)
        scheme_version (int): the scheme version of the swh
            persistent identifiers

    Returns:
        str: the swh object persistent identifier

    Raises:
        BadInputExc: if the provided parameters do not enable to
            generate a valid identifier
    """
    try:
        return persistent_identifier(object_type, object_id, scheme_version)
    except ValidationError as e:
        raise BadInputExc('Invalid object (%s) for swh persistent id. %s' %
                          (object_id, e)) from e


def resolve_swh_persistent_id(swh_id, query_params=None):
    """
    Try to resolve a Software Heritage persistent id into an url for
    browsing the pointed object.

    Args:
        swh_id (str): a Software Heritage persistent identifier
        query_params (django.http.QueryDict): optional dict filled with
            query parameters to append to the browse url

    Returns:
        dict: a dict with the following keys:

            * **swh_id_parsed (swh.model.identifiers.PersistentId)**:
              the parsed identifier
            * **browse_url (str)**: the url for browsing the pointed object
              (None when the object type has no browse view)

    Raises:
        BadInputExc: if the provided identifier can not be parsed
    """
    try:
        swh_id_parsed = parse_persistent_identifier(swh_id)
    except ValidationError as ve:
        raise BadInputExc('Error when parsing identifier. %s' %
                          ' '.join(ve.messages)) from ve

    object_type = swh_id_parsed.object_type
    object_id = swh_id_parsed.object_id

    query_dict = QueryDict('', mutable=True)
    if query_params:
        for k in sorted(query_params.keys()):
            query_dict[k] = query_params[k]
    # an origin carried by the identifier takes precedence over the
    # one that may be present in the query parameters
    if 'origin' in swh_id_parsed.metadata:
        query_dict['origin'] = swh_id_parsed.metadata['origin']

    # browse view name and url argument name for the object types that
    # are resolved from their identifier alone
    simple_views = {
        DIRECTORY: ('browse-directory', 'sha1_git'),
        RELEASE: ('browse-release', 'sha1_git'),
        REVISION: ('browse-revision', 'sha1_git'),
        SNAPSHOT: ('browse-snapshot', 'snapshot_id'),
    }

    browse_url = None
    if object_type == CONTENT:
        query_string = 'sha1_git:' + object_id
        # a "lines" qualifier becomes a #L<start>[-L<end>] url fragment
        fragment = ''
        if 'lines' in swh_id_parsed.metadata:
            lines = swh_id_parsed.metadata['lines'].split('-')
            fragment += '#L' + lines[0]
            if len(lines) > 1:
                fragment += '-L' + lines[1]
        browse_url = reverse('browse-content',
                             url_args={'query_string': query_string},
                             query_params=query_dict) + fragment
    elif object_type in simple_views:
        view_name, arg_name = simple_views[object_type]
        browse_url = reverse(view_name,
                             url_args={arg_name: object_id},
                             query_params=query_dict)

    return {'swh_id_parsed': swh_id_parsed,
            'browse_url': browse_url}
def parse_rst(text, report_level=2):
    """
    Parse a reStructuredText string with docutils.

    Args:
        text (str): string with reStructuredText markups in it
        report_level (int): level of docutils report messages to print
            (1 info 2 warning 3 error 4 severe 5 none)

    Returns:
        docutils.nodes.document: a parsed docutils document
    """
    # imported explicitly: docutils.frontend is used below and should not
    # be reachable only as a side effect of other docutils imports
    import docutils.frontend

    parser = docutils.parsers.rst.Parser()
    components = (docutils.parsers.rst.Parser,)
    settings = docutils.frontend.OptionParser(
        components=components).get_default_values()
    settings.report_level = report_level
    document = docutils.utils.new_document('rst-doc', settings=settings)
    parser.parse(text, document)
    return document


def get_client_ip(request):
    """
    Return the client IP address from an incoming HTTP request.

    Args:
        request (django.http.HttpRequest): the incoming HTTP request

    Returns:
        str: The client IP address
    """
    x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
    if x_forwarded_for:
        # the header is a comma separated list, possibly with spaces
        # around the commas; its first entry is used
        ip = x_forwarded_for.split(',')[0].strip()
    else:
        ip = request.META.get('REMOTE_ADDR')
    return ip


def context_processor(request):
    """
    Django context processor used to inject variables
    in all swh-web templates.

    Returns:
        dict: the ``swh_object_icons`` mapping and an
        ``available_languages`` entry set to None
    """
    return {'swh_object_icons': swh_object_icons,
            'available_languages': None}


class EnforceCSRFAuthentication(SessionAuthentication):
    """
    Helper class to enforce CSRF validation on a DRF view
    when a user is not authenticated.
    """

    def authenticate(self, request):
        """Run the CSRF check on every request and return a
        ``(user, None)`` tuple, ``user`` being the one attached to the
        underlying request (None when there is none).
        """
        user = getattr(request._request, 'user', None)
        self.enforce_csrf(request)
        return (user, None)