diff --git a/swh/core/tests/test_utils.py b/swh/core/tests/test_utils.py
--- a/swh/core/tests/test_utils.py
+++ b/swh/core/tests/test_utils.py
@@ -167,3 +167,40 @@
 def test_basename_sotkey():
     assert utils.basename_sortkey("00-xxx.sql") == (0, "-xxx.sql")
     assert utils.basename_sortkey("path/to/00-xxx.sql") == (0, "-xxx.sql")
+
+
+KNOWN_GH_REPO = "https://github.com/user/repo"
+
+
+@pytest.mark.parametrize(
+    "user_repo, expected_url",
+    [
+        ("user/repo.git", KNOWN_GH_REPO),
+        ("user/repo.git/", KNOWN_GH_REPO),
+        ("user/repo/", KNOWN_GH_REPO),
+        ("user/repo", KNOWN_GH_REPO),
+        # edge cases
+        ("https://github.com/unknown-page", None),  # unknown gh origin returns None
+    ],
+)
+def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock):
+    """It should return a canonicalized github origin when it exists, None otherwise"""
+    html_url = utils._url_github_html(user_repo)
+    api_url = utils._url_github_api(utils._sanitize_github_url(user_repo))
+
+    if expected_url is not None:
+        status_code = 200
+        response = {"html_url": utils._sanitize_github_url(html_url)}
+    else:
+        status_code = 404
+        response = {}
+
+    requests_mock.get(api_url, [{"status_code": status_code, "json": response}])
+
+    assert utils.get_canonical_github_origin_url(html_url) == expected_url
+
+
+def test_get_canonical_github_origin_url_not_gh_origin():
+    """It should return the input url when not a github origin as input"""
+    url = "https://example.org"
+    assert utils.get_canonical_github_origin_url(url) == url
diff --git a/swh/core/utils.py b/swh/core/utils.py
--- a/swh/core/utils.py
+++ b/swh/core/utils.py
@@ -8,7 +8,9 @@
 import itertools
 import os
 import re
-from typing import Iterable, Tuple, TypeVar
+from typing import Iterable, Optional, Tuple, TypeVar
+
+import requests
 
 
 @contextmanager
@@ -187,3 +189,52 @@
 def basename_sortkey(fname: str) -> Tuple[int, str]:
     "like numfile_sortkey but on basenames"
     return numfile_sortkey(os.path.basename(fname))
+
+
+# Everything after the host is captured in the "user_repo" named group
+GITHUB_PATTERN = re.compile(r"https?://github\.com/(?P<user_repo>.*)")
+
+
+def _url_github_html(user_repo: str) -> str:
+    """Given the user repo, returns the expected github html url."""
+    return f"https://github.com/{user_repo}"
+
+
+def _url_github_api(user_repo: str) -> str:
+    """Given the user_repo, returns the expected github api url."""
+    return f"https://api.github.com/repos/{user_repo}"
+
+
+def _sanitize_github_url(url: str) -> str:
+    """Lowercase the url, then strip one trailing slash and one ".git" suffix."""
+    url_ = url.lower()
+    if url_.endswith("/"):
+        url_ = url_[:-1]
+
+    if url_.endswith(".git"):
+        url_ = url_[:-4]
+
+    return url_
+
+
+def get_canonical_github_origin_url(url: str) -> Optional[str]:
+    """Retrieve canonical github url out of an url.
+
+    Returns:
+        The input url unchanged when it does not match GITHUB_PATTERN; None
+        when the github api does not answer with a 200 status code for the
+        repository; otherwise the "html_url" value of the api response.
+
+    """
+    url_ = url.lower()
+
+    match = GITHUB_PATTERN.match(url_)
+    if not match:
+        return url
+
+    user_repo = _sanitize_github_url(match.groupdict()["user_repo"])
+    response = requests.get(_url_github_api(user_repo))
+    if response.status_code != 200:
+        return None
+    data = response.json()
+    return data["html_url"]