diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -8,6 +8,7 @@
 import os
 import re
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
+from urllib.parse import urlparse
 
 from swh.model import hashutil
 from swh.model.identifiers import CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT
@@ -248,6 +249,25 @@
         # slash while the url in storage have it (e.g. Debian source package)
         else:
             origin_urls.append(f"{origin['url']}/")
+        # handle case where the "://" character sequence was mangled into ":/"
+        try:
+            parsed_url = urlparse(origin["url"])
+        except ValueError:
+            # urlparse rejects some malformed urls (e.g. unbalanced IPv6
+            # brackets); there is no scheme to repair in that case
+            parsed_url = None
+        if (
+            parsed_url is not None
+            and parsed_url.scheme
+            and not parsed_url.netloc
+            and f"{parsed_url.scheme}:/" in origin["url"]
+            and f"{parsed_url.scheme}://" not in origin["url"]
+        ):
+            origin_urls.append(
+                origin["url"].replace(
+                    f"{parsed_url.scheme}:/", f"{parsed_url.scheme}://"
+                )
+            )
     origins = [o for o in storage.origin_get(origin_urls) if o is not None]
     if not origins:
         msg = "Origin with url %s not found!" % origin["url"]
diff --git a/swh/web/common/identifiers.py b/swh/web/common/identifiers.py
--- a/swh/web/common/identifiers.py
+++ b/swh/web/common/identifiers.py
@@ -116,7 +116,9 @@
             query_dict[k] = query_params[k]
 
     if "origin" in swhid_parsed.metadata:
-        query_dict["origin_url"] = unquote(swhid_parsed.metadata["origin"])
+        origin_url = unquote(swhid_parsed.metadata["origin"])
+        origin_url = archive.lookup_origin({"url": origin_url})["url"]
+        query_dict["origin_url"] = origin_url
 
     if "anchor" in swhid_parsed.metadata:
         anchor_swhid_parsed = get_swhid(swhid_parsed.metadata["anchor"])
diff --git a/swh/web/tests/browse/views/test_identifiers.py b/swh/web/tests/browse/views/test_identifiers.py
--- a/swh/web/tests/browse/views/test_identifiers.py
+++ b/swh/web/tests/browse/views/test_identifiers.py
@@ -9,6 +9,7 @@
 from hypothesis import given
 
 from swh.model.identifiers import CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT
+from swh.model.model import Origin
 from swh.web.common.identifiers import gen_swhid
 from swh.web.common.utils import reverse
 from swh.web.tests.django_asserts import assert_contains
@@ -123,9 +124,12 @@
 
 
 @given(content())
-def test_content_id_optional_parts_browse(client, content):
+def test_content_id_optional_parts_browse(client, archive_data, content):
     cnt_sha1_git = content["sha1_git"]
     origin_url = "https://github.com/user/repo"
+
+    archive_data.origin_add([Origin(url=origin_url)])
+
     swhid = gen_swhid(
         CONTENT, cnt_sha1_git, metadata={"lines": "4-20", "origin": origin_url},
     )
@@ -187,8 +191,9 @@
 
 
 @given(directory())
-def test_browse_swhid_special_characters_escaping(client, directory):
+def test_browse_swhid_special_characters_escaping(client, archive_data, directory):
     origin = "http://example.org/?project=abc;"
+    archive_data.origin_add([Origin(url=origin)])
     origin_swhid_escaped = quote(origin, safe="/?:@&")
     origin_swhid_url_escaped = quote(origin, safe="/:@;")
     swhid = gen_swhid(DIRECTORY, directory, metadata={"origin": origin_swhid_escaped})
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -975,6 +975,14 @@
     assert origin_info["url"] == deb_origin.url
 
 
+def test_lookup_origin_single_slash_after_protocol(archive_data):
+    origin_url = "http://snapshot.debian.org/package/r-base/"
+    malformed_origin_url = "http:/snapshot.debian.org/package/r-base/"
+    archive_data.origin_add([Origin(url=origin_url)])
+    origin_info = archive.lookup_origin({"url": malformed_origin_url})
+    assert origin_info["url"] == origin_url
+
+
 @given(snapshot())
 def test_lookup_snapshot_branch_name_from_tip_revision(archive_data, snapshot_id):
     snapshot = archive_data.snapshot_get(snapshot_id)
diff --git a/swh/web/tests/common/test_identifiers.py b/swh/web/tests/common/test_identifiers.py
--- a/swh/web/tests/common/test_identifiers.py
+++ b/swh/web/tests/common/test_identifiers.py
@@ -19,6 +19,7 @@
     SWHID,
     parse_swhid,
 )
+from swh.model.model import Origin
 from swh.web.browse.snapshot_context import get_snapshot_context
 from swh.web.common.exc import BadInputExc
 from swh.web.common.identifiers import (
@@ -637,3 +638,13 @@
         query_params={"path": dir_subdir_path},
     )
     assert resolved_swhid["browse_url"] == browse_url
+
+
+@given(directory())
+def test_resolve_swhid_with_malformed_origin_url(archive_data, directory):
+    origin_url = "http://example.org/project/abc"
+    malformed_origin_url = "http:/example.org/project/abc"
+    archive_data.origin_add([Origin(url=origin_url)])
+    swhid = gen_swhid(DIRECTORY, directory, metadata={"origin": malformed_origin_url})
+    resolved_swhid = resolve_swhid(swhid)
+    assert origin_url in resolved_swhid["browse_url"]