diff --git a/swh/core/github/__init__.py b/swh/core/github/__init__.py
new file mode 100644
diff --git a/swh/core/github/utils.py b/swh/core/github/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/core/github/utils.py
@@ -0,0 +1,73 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import re
+from typing import Optional
+
+import requests
+
+# Matches http(s) GitHub urls; everything after the host is captured as `user_repo`.
+GITHUB_PATTERN = re.compile(r"https?://github\.com/(?P<user_repo>.*)")
+
+# Seconds to wait for the GitHub API before giving up instead of blocking forever.
+GITHUB_API_TIMEOUT = 30
+
+
+def _url_github_html(user_repo: str) -> str:
+    """Given the user repo, returns the expected github html url."""
+    return f"https://github.com/{user_repo}"
+
+
+def _url_github_api(user_repo: str) -> str:
+    """Given the user_repo, returns the expected github api url."""
+    return f"https://api.github.com/repos/{user_repo}"
+
+
+def _sanitize_github_url(url: str) -> str:
+    """Lowercase a github url and drop its trailing slashes and ``.git`` suffix.
+
+    The ``.git`` suffix is removed as a whole: ``str.rstrip(".git")`` would strip any
+    trailing run of the characters ``.``, ``g``, ``i`` and ``t`` and so mangle names
+    such as ``user/digit``.
+
+    """
+    url = url.lower().rstrip("/")
+    if url.endswith(".git"):
+        url = url[: -len(".git")]
+    return url.rstrip("/")
+
+
+def get_canonical_github_origin_url(url: str) -> Optional[str]:
+    """Retrieve the canonical github url of a github origin url.
+
+    This triggers an anonymous http request to the github api url to determine the
+    canonicalized repository url.
+
+    Args:
+        url: Origin url to canonicalize
+
+    Returns:
+        The input url, unchanged, when it is not a github url; the ``html_url``
+        reported by the github api when the repository is found; None when the api
+        does not answer with a 200 status code.
+
+    Raises:
+        requests.exceptions.RequestException: when the api cannot be reached, e.g.
+            when no answer arrives within GITHUB_API_TIMEOUT seconds.
+
+    """
+    url_ = url.lower()
+
+    match = GITHUB_PATTERN.match(url_)
+    if not match:
+        return url
+
+    user_repo = _sanitize_github_url(match.group("user_repo"))
+    response = requests.get(_url_github_api(user_repo), timeout=GITHUB_API_TIMEOUT)
+    if response.status_code != 200:
+        return None
+    data = response.json()
+    return data["html_url"]
diff --git a/swh/core/tests/test_github_utils.py b/swh/core/tests/test_github_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/core/tests/test_github_utils.py
@@ -0,0 +1,53 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.core.github.utils import (
+    _sanitize_github_url,
+    _url_github_api,
+    _url_github_html,
+    get_canonical_github_origin_url,
+)
+
+KNOWN_GH_REPO = "https://github.com/user/repo"
+
+
+@pytest.mark.parametrize(
+    "user_repo, expected_url",
+    [
+        ("user/repo.git", KNOWN_GH_REPO),
+        ("user/repo.git/", KNOWN_GH_REPO),
+        ("user/repo/", KNOWN_GH_REPO),
+        ("user/repo", KNOWN_GH_REPO),
+        ("user/repo/.git", KNOWN_GH_REPO),
+        # names ending with the characters of ".git" must be left untouched
+        ("user/digit", "https://github.com/user/digit"),
+        # edge cases
+        ("https://github.com/unknown-page", None),  # unknown gh origin returns None
+        ("user/repo/with/some/deps", None),  # url kind is not dealt with for now
+    ],
+)
+def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock):
+    """It should return a canonicalized github origin when it exists, None otherwise"""
+    html_url = _url_github_html(user_repo)
+    api_url = _url_github_api(_sanitize_github_url(user_repo))
+
+    if expected_url is not None:
+        status_code = 200
+        response = {"html_url": _sanitize_github_url(html_url)}
+    else:
+        status_code = 404
+        response = {}
+
+    requests_mock.get(api_url, [{"status_code": status_code, "json": response}])
+
+    assert get_canonical_github_origin_url(html_url) == expected_url
+
+
+def test_get_canonical_github_origin_url_not_gh_origin():
+    """It should return the input url when not a github origin as input"""
+    url = "https://example.org"
+    assert get_canonical_github_origin_url(url) == url