diff --git a/debian/control b/debian/control index 70c599b3..eeef307a 100644 --- a/debian/control +++ b/debian/control @@ -1,42 +1,42 @@ Source: swh-web Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: curl, debhelper (>= 9), dh-python (>= 2), python3-all, python3-bs4, python3-django (>= 1.10.7~), python3-djangorestframework (>= 3.4.0~), python3-django-webpack-loader, python3-django-js-reverse, python3-docutils, python3-htmlmin, python3-magic (>= 0.3.0~), python3-lxml, python3-nose, python3-pygments, python3-pypandoc, python3-setuptools, python3-sphinx, python3-sphinxcontrib.httpdomain, python3-yaml, python3-swh.core (>= 0.0.40~), - python3-swh.model (>= 0.0.24~), + python3-swh.model (>= 0.0.25~), python3-swh.storage (>= 0.0.101~), python3-swh.indexer.storage (>= 0.0.52~), python3-swh.vault (>= 0.0.20~) Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DWUI/ Package: python3-swh.web Architecture: all Depends: python3-swh.core (>= 0.0.40~), - python3-swh.model (>= 0.0.24~), + python3-swh.model (>= 0.0.25~), python3-swh.storage (>= 0.0.101~), python3-swh.indexer.storage (>= 0.0.52~), python3-swh.vault (>= 0.0.20~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Web Applications diff --git a/docs/uri-scheme-api-identifiers.rst b/docs/uri-scheme-api-identifiers.rst new file mode 100644 index 00000000..4d8c40a6 --- /dev/null +++ b/docs/uri-scheme-api-identifiers.rst @@ -0,0 +1,4 @@ +Persistent identifiers +---------------------- + +.. autosimple:: swh.web.api.views.identifiers.api_resolve_swh_pid diff --git a/docs/uri-scheme-api.rst b/docs/uri-scheme-api.rst index 6d0dde20..b059c15e 100644 --- a/docs/uri-scheme-api.rst +++ b/docs/uri-scheme-api.rst @@ -1,22 +1,24 @@ .. _swh-web-api-urls: SWH Web API URLs ================ .. include:: uri-scheme-api-content.rst .. include:: uri-scheme-api-directory.rst +.. include:: uri-scheme-api-identifiers.rst + .. 
include:: uri-scheme-api-origin.rst .. include:: uri-scheme-api-person.rst .. include:: uri-scheme-api-release.rst .. include:: uri-scheme-api-revision.rst .. include:: uri-scheme-api-snapshot.rst .. include:: uri-scheme-api-stat.rst .. include:: uri-scheme-api-vault.rst diff --git a/requirements-swh.txt b/requirements-swh.txt index 4be0e509..b0a32103 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.40 -swh.model >= 0.0.24 +swh.model >= 0.0.25 swh.storage >= 0.0.101 swh.vault >= 0.0.20 swh.indexer >= 0.0.52 diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py index de2e0101..6f325ed8 100644 --- a/swh/web/api/urls.py +++ b/swh/web/api/urls.py @@ -1,19 +1,20 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import swh.web.api.views.content # noqa import swh.web.api.views.directory # noqa import swh.web.api.views.entity # noqa +import swh.web.api.views.identifiers # noqa import swh.web.api.views.origin # noqa import swh.web.api.views.person # noqa import swh.web.api.views.release # noqa import swh.web.api.views.revision # noqa import swh.web.api.views.snapshot # noqa import swh.web.api.views.stat # noqa import swh.web.api.views.vault # noqa from swh.web.api.apiurls import APIUrls urlpatterns = APIUrls.get_url_patterns() diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py new file mode 100644 index 00000000..7088fe32 --- /dev/null +++ b/swh/web/api/views/identifiers.py @@ -0,0 +1,77 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.model.identifiers import ( + CONTENT, 
DIRECTORY, RELEASE, REVISION, SNAPSHOT +) + +from swh.web.common import service +from swh.web.common.utils import resolve_swh_persistent_id +from swh.web.api.apidoc import api_doc +from swh.web.api.apiurls import api_route + + +@api_route(r'/resolve/(?P.*)/', + 'resolve-swh-pid') +@api_doc('/resolve/') +def api_resolve_swh_pid(request, swh_id): + """ + .. http:get:: /api/1/resolve/(swh_id)/ + + Resolve a Software Heritage persistent identifier. + + Try to resolve a provided `persistent identifier `_ + into an url for browsing the pointed archive object. If the provided + identifier is valid, the existence of the object in the archive + will also be checked. + + :param string swh_id: a SWH persistent identifier + + :>json string browse_url: the url for browsing the pointed object + :>json object metadata: object holding optional parts of the persistent identifier + :>json string namespace: the persistent identifier namespace + :>json string object_id: the hash identifier of the pointed object + :>json string object_type: the type of the pointed object + :>json number scheme_version: the scheme version of the persistent identifier + + :reqheader Accept: the requested response content type, + either *application/json* (default) or *application/yaml* + :resheader Content-Type: this depends on :http:header:`Accept` header of request + + **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`, :http:method:`options` + + :statuscode 200: no error + :statuscode 400: an invalid persistent identifier has been provided + :statuscode 404: the pointed object does not exist in the archive + + **Example:** + + .. 
parsed-literal:: + + :swh_web_api:`resolve/swh:1:rev:96db9023b881d7cd9f379b0c154650d6c108e9a3;origin=https://github.com/openssl/openssl/` + """ # noqa + # try to resolve the provided pid + swh_id_resolved = resolve_swh_persistent_id(swh_id) + # id is well-formed, now check that the pointed + # object is present in the archive, NotFoundExc + # will be raised otherwise + swh_id_parsed = swh_id_resolved['swh_id_parsed'] + object_type = swh_id_parsed.object_type + object_id = swh_id_parsed.object_id + if object_type == CONTENT: + service.lookup_content('sha1_git:%s' % object_id) + elif object_type == DIRECTORY: + service.lookup_directory(object_id) + elif object_type == RELEASE: + service.lookup_release(object_id) + elif object_type == REVISION: + service.lookup_revision(object_id) + elif object_type == SNAPSHOT: + service.lookup_snapshot(object_id) + # id is well-formed and the pointed object exists + swh_id_data = swh_id_parsed._asdict() + swh_id_data['browse_url'] = swh_id_resolved['browse_url'] + return swh_id_data diff --git a/swh/web/browse/identifiers.py b/swh/web/browse/identifiers.py index 45709bba..22fd98b1 100644 --- a/swh/web/browse/identifiers.py +++ b/swh/web/browse/identifiers.py @@ -1,62 +1,25 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import redirect -from swh.model.identifiers import parse_persistent_identifier - -from swh.web.common.utils import reverse -from swh.web.common.exc import BadInputExc, handle_view_exception +from swh.web.common.utils import resolve_swh_persistent_id +from swh.web.common.exc import handle_view_exception def swh_id_browse(request, swh_id): """ Django view enabling to browse the SWH archive using :ref:`persistent-identifiers`. The url that points to it is :http:get:`/(swh_id)/`. 
""" try: - swh_id_parsed = parse_persistent_identifier(swh_id) - object_type = swh_id_parsed['object_type'] - object_id = swh_id_parsed['object_id'] - view_url = None - query_params = request.GET.copy() - if 'origin' in swh_id_parsed['metadata']: - query_params['origin'] = swh_id_parsed['metadata']['origin'] - if object_type == 'cnt': - query_string = 'sha1_git:' + object_id - fragment = '' - if 'lines' in swh_id_parsed['metadata']: - lines = swh_id_parsed['metadata']['lines'].split('-') - fragment += '#L' + lines[0] - if len(lines) > 1: - fragment += '-L' + lines[1] - view_url = reverse('browse-content', - kwargs={'query_string': query_string}, - query_params=query_params) + fragment - elif object_type == 'dir': - view_url = reverse('browse-directory', - kwargs={'sha1_git': object_id}, - query_params=query_params) - elif object_type == 'rel': - view_url = reverse('browse-release', - kwargs={'sha1_git': object_id}, - query_params=query_params) - elif object_type == 'rev': - view_url = reverse('browse-revision', - kwargs={'sha1_git': object_id}, - query_params=query_params) - elif object_type == 'snp': - view_url = reverse('browse-snapshot', - kwargs={'snapshot_id': object_id}, - query_params=query_params) - else: - msg = '\'%s\' is not a valid SWH persistent identifier!' 
% swh_id - raise BadInputExc(msg) + swh_id_resolved = resolve_swh_persistent_id( + swh_id, query_params=request.GET) except Exception as exc: return handle_view_exception(request, exc) - return redirect(view_url) + return redirect(swh_id_resolved['browse_url']) diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py index 92756cf9..ad6b29d8 100644 --- a/swh/web/common/utils.py +++ b/swh/web/common/utils.py @@ -1,308 +1,374 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import docutils.parsers.rst import docutils.utils import re from datetime import datetime, timezone from dateutil import parser as date_parser from dateutil import tz from django.core.cache import cache from django.core import urlresolvers from django.http import QueryDict from swh.model.exceptions import ValidationError -from swh.model.identifiers import persistent_identifier +from swh.model.identifiers import ( + persistent_identifier, parse_persistent_identifier, + CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT +) from swh.web.common import service from swh.web.common.exc import BadInputExc def reverse(viewname, args=None, kwargs=None, query_params=None, current_app=None, urlconf=None): """An override of django reverse function supporting query parameters. 
Args: viewname: the name of the django view from which to compute a url args: list of url arguments ordered according to their position it kwargs: dictionary of url arguments indexed by their names query_params: dictionary of query parameters to append to the reversed url current_app: the name of the django app tighted to the view urlconf: url configuration module Returns: The url of the requested view with processed arguments and query parameters """ if kwargs: kwargs = {k: v for k, v in kwargs.items() if v is not None} url = urlresolvers.reverse( viewname, urlconf=urlconf, args=args, kwargs=kwargs, current_app=current_app) if query_params: query_params = {k: v for k, v in query_params.items() if v is not None} if query_params and len(query_params) > 0: query_dict = QueryDict('', mutable=True) for k in sorted(query_params.keys()): query_dict[k] = query_params[k] url += ('?' + query_dict.urlencode(safe='/')) return url def fmap(f, data): """Map f to data at each level. This must keep the origin data structure type: - map -> map - dict -> dict - list -> list - None -> None Args: f: function that expects one argument. data: data to traverse to apply the f function. list, map, dict or bare value. Returns: The same data-structure with modified values by the f function. """ if data is None: return data if isinstance(data, map): return map(lambda y: fmap(f, y), (x for x in data)) if isinstance(data, list): return [fmap(f, x) for x in data] if isinstance(data, dict): return {k: fmap(f, v) for (k, v) in data.items()} return f(data) def datetime_to_utc(date): """Returns datetime in UTC without timezone info Args: date (datetime.datetime): input datetime with timezone info Returns: datetime.datime: datetime in UTC without timezone info """ if date.tzinfo: return date.astimezone(tz.gettz('UTC')).replace(tzinfo=timezone.utc) else: return date def parse_timestamp(timestamp): """Given a time or timestamp (as string), parse the result as UTC datetime. 
Returns: a timezone-aware datetime representing the parsed value. None if the parsing fails. Samples: - 2016-01-12 - 2016-01-12T09:19:12+0100 - Today is January 1, 2047 at 8:21:00AM - 1452591542 """ if not timestamp: return None try: date = date_parser.parse(timestamp, ignoretz=False, fuzzy=True) return datetime_to_utc(date) except Exception: try: return datetime.utcfromtimestamp(float(timestamp)).replace( tzinfo=timezone.utc) except (ValueError, OverflowError) as e: raise BadInputExc(e) def shorten_path(path): """Shorten the given path: for each hash present, only return the first 8 characters followed by an ellipsis""" sha256_re = r'([0-9a-f]{8})[0-9a-z]{56}' sha1_re = r'([0-9a-f]{8})[0-9a-f]{32}' ret = re.sub(sha256_re, r'\1...', path) return re.sub(sha1_re, r'\1...', ret) def format_utc_iso_date(iso_date, fmt='%d %B %Y, %H:%M UTC'): """Turns a string reprensation of an ISO 8601 date string to UTC and format it into a more human readable one. For instance, from the following input string: '2017-05-04T13:27:13+02:00' the following one is returned: '04 May 2017, 11:27 UTC'. Custom format string may also be provided as parameter Args: iso_date (str): a string representation of an ISO 8601 date fmt (str): optional date formatting string Returns: A formatted string representation of the input iso date """ if not iso_date: return iso_date date = parse_timestamp(iso_date) return date.strftime(fmt) def gen_path_info(path): """Function to generate path data navigation for use with a breadcrumb in the swh web ui. For instance, from a path /folder1/folder2/folder3, it returns the following list:: [{'name': 'folder1', 'path': 'folder1'}, {'name': 'folder2', 'path': 'folder1/folder2'}, {'name': 'folder3', 'path': 'folder1/folder2/folder3'}] Args: path: a filesystem path Returns: A list of path data for navigation as illustrated above. 
""" path_info = [] if path: sub_paths = path.strip('/').split('/') path_from_root = '' for p in sub_paths: path_from_root += '/' + p path_info.append({'name': p, 'path': path_from_root.strip('/')}) return path_info def get_origin_visits(origin_info): """Function that returns the list of visits for a swh origin. That list is put in cache in order to speedup the navigation in the swh web browse ui. Args: origin_id (int): the id of the swh origin to fetch visits from Returns: A list of dict describing the origin visits:: [{'date': , 'origin': , 'status': <'full' | 'partial'>, 'visit': }, ... ] Raises: NotFoundExc if the origin is not found """ cache_entry_id = 'origin_%s_visits' % origin_info['id'] cache_entry = cache.get(cache_entry_id) if cache_entry: return cache_entry origin_visits = [] per_page = service.MAX_LIMIT last_visit = None while 1: visits = list(service.lookup_origin_visits(origin_info['id'], last_visit=last_visit, per_page=per_page)) origin_visits += visits if len(visits) < per_page: break else: if not last_visit: last_visit = per_page else: last_visit += per_page def _visit_sort_key(visit): ts = parse_timestamp(visit['date']).timestamp() return ts + (float(visit['visit']) / 10e3) for v in origin_visits: if 'metadata' in v: del v['metadata'] origin_visits = [dict(t) for t in set([tuple(d.items()) for d in origin_visits])] origin_visits = sorted(origin_visits, key=lambda v: _visit_sort_key(v)) cache.set(cache_entry_id, origin_visits) return origin_visits def get_swh_persistent_id(object_type, object_id, scheme_version=1): """ Returns the persistent identifier for a swh object based on: * the object type * the object id * the swh identifiers scheme version Args: object_type (str): the swh object type (content/directory/release/revision/snapshot) object_id (str): the swh object id (hexadecimal representation of its hash value) scheme_version (int): the scheme version of the swh persistent identifiers Returns: str: the swh object persistent identifier 
Raises: BadInputExc if the provided parameters do not enable to generate a valid identifier """ try: swh_id = persistent_identifier(object_type, object_id, scheme_version) except ValidationError as e: raise BadInputExc('Invalid object (%s) for swh persistent id. %s' % (object_id, e)) else: return swh_id +def resolve_swh_persistent_id(swh_id, query_params=None): + """ + Try to resolve a SWH persistent id into an url for + browsing the pointed object. + + Args: + swh_id (str): a SWH persistent identifier + query_params (django.http.QueryDict): optional dict filled with + query parameters to append to the browse url + + Returns: + dict: a dict with the following keys: + + * **swh_id_parsed (swh.model.identifiers.PersistentId)**: the parsed identifier + * **browse_url (str)**: the url for browsing the pointed object + + Raises: + BadInputExc: if the provided identifier can not be parsed + """ # noqa + try: + swh_id_parsed = parse_persistent_identifier(swh_id) + object_type = swh_id_parsed.object_type + object_id = swh_id_parsed.object_id + browse_url = None + if not query_params: + query_params = QueryDict('', mutable=True) + if 'origin' in swh_id_parsed.metadata: + query_params['origin'] = swh_id_parsed.metadata['origin'] + if object_type == CONTENT: + query_string = 'sha1_git:' + object_id + fragment = '' + if 'lines' in swh_id_parsed.metadata: + lines = swh_id_parsed.metadata['lines'].split('-') + fragment += '#L' + lines[0] + if len(lines) > 1: + fragment += '-L' + lines[1] + browse_url = reverse('browse-content', + kwargs={'query_string': query_string}, + query_params=query_params) + fragment + elif object_type == DIRECTORY: + browse_url = reverse('browse-directory', + kwargs={'sha1_git': object_id}, + query_params=query_params) + elif object_type == RELEASE: + browse_url = reverse('browse-release', + kwargs={'sha1_git': object_id}, + query_params=query_params) + elif object_type == REVISION: + browse_url = reverse('browse-revision', + kwargs={'sha1_git': 
object_id}, + query_params=query_params) + elif object_type == SNAPSHOT: + browse_url = reverse('browse-snapshot', + kwargs={'snapshot_id': object_id}, + query_params=query_params) + except ValidationError as ve: + raise BadInputExc('Error when parsing identifier. %s' % + ' '.join(ve.messages)) + else: + return {'swh_id_parsed': swh_id_parsed, + 'browse_url': browse_url} + + def parse_rst(text, report_level=2): """ Parse a reStructuredText string with docutils. Args: text (str): string with reStructuredText markups in it report_level (int): level of docutils report messages to print (1 info 2 warning 3 error 4 severe 5 none) Returns: docutils.nodes.document: a parsed docutils document """ parser = docutils.parsers.rst.Parser() components = (docutils.parsers.rst.Parser,) settings = docutils.frontend.OptionParser( components=components).get_default_values() settings.report_level = report_level document = docutils.utils.new_document('rst-doc', settings=settings) parser.parse(text, document) return document diff --git a/swh/web/tests/api/views/test_identifiers.py b/swh/web/tests/api/views/test_identifiers.py new file mode 100644 index 00000000..f5cfb513 --- /dev/null +++ b/swh/web/tests/api/views/test_identifiers.py @@ -0,0 +1,69 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from nose.tools import istest +from rest_framework.test import APITestCase +from unittest.mock import patch + +from swh.model.identifiers import REVISION + +from swh.web.common.utils import reverse +from swh.web.common.exc import NotFoundExc +from swh.web.tests.testbase import SWHWebTestBase + + +class SwhIdsApiTestCase(SWHWebTestBase, APITestCase): + + @istest + @patch('swh.web.api.views.identifiers.service') + def swh_id_resolve_success(self, mock_service): + rev_id = 
'96db9023b881d7cd9f379b0c154650d6c108e9a3' + origin = 'https://github.com/openssl/openssl' + swh_id = 'swh:1:rev:%s;origin=%s' % (rev_id, origin) + url = reverse('resolve-swh-pid', kwargs={'swh_id': swh_id}) + + mock_service.lookup_revision.return_value = {} + + resp = self.client.get(url) + + browse_rev_url = reverse('browse-revision', + kwargs={'sha1_git': rev_id}, + query_params={'origin': origin}) + + expected_result = { + 'browse_url': browse_rev_url, + 'metadata': {'origin': origin}, + 'namespace': 'swh', + 'object_id': rev_id, + 'object_type': REVISION, + 'scheme_version': 1 + } + + self.assertEquals(resp.status_code, 200) + self.assertEquals(resp.data, expected_result) + + @istest + def swh_id_resolve_invalid(self): + rev_id_invalid = '96db9023b8_foo_50d6c108e9a3' + swh_id = 'swh:1:rev:%s' % rev_id_invalid + url = reverse('resolve-swh-pid', kwargs={'swh_id': swh_id}) + + resp = self.client.get(url) + + self.assertEquals(resp.status_code, 400) + + @istest + @patch('swh.web.api.views.identifiers.service') + def swh_id_resolve_not_found(self, mock_service): + rev_id_not_found = '56db90232881d7cd9e379b0c154650d6c108e9a1' + + swh_id = 'swh:1:rev:%s' % rev_id_not_found + url = reverse('resolve-swh-pid', kwargs={'swh_id': swh_id}) + + mock_service.lookup_revision.side_effect = NotFoundExc('Revision not found !') # noqa + + resp = self.client.get(url) + + self.assertEquals(resp.status_code, 404)