Page MenuHomeSoftware Heritage

D7836.diff
No OneTemporary

D7836.diff

diff --git a/swh/core/github/__init__.py b/swh/core/github/__init__.py
new file mode 100644
diff --git a/swh/core/github/utils.py b/swh/core/github/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/core/github/utils.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import re
+from typing import Optional
+
+import requests
+
+GITHUB_PATTERN = re.compile(r"https?://github\.com/(?P<user_repo>.*)")  # literal dot
+
+
+def _url_github_html(user_repo: str) -> str:
+ """Return the github html url ``https://github.com/<user_repo>`` of a user repo."""
+ return f"https://github.com/{user_repo}"
+
+
+def _url_github_api(user_repo: str) -> str:
+ """Return the github api url ``https://api.github.com/repos/<user_repo>``."""
+ return f"https://api.github.com/repos/{user_repo}"
+
+
+def _sanitize_github_url(url: str) -> str:
+    """Lowercase ``url`` and drop its trailing slashes and exact ``.git`` suffix."""
+    return re.sub(r"\.git\Z", "", url.lower().rstrip("/")).rstrip("/")
+
+
+def get_canonical_github_origin_url(url: str) -> Optional[str]:
+    """Return the canonical github repository url for ``url``.
+
+    A url that is not a github one is returned unchanged. Otherwise an anonymous
+    http request is sent to the github api url; ``None`` is returned when the api
+    does not answer with status 200 or when its payload holds no ``html_url``.
+
+    """
+    match = GITHUB_PATTERN.match(url.lower())
+    if not match:
+        return url
+
+    user_repo = _sanitize_github_url(match.group("user_repo"))
+    # Without a timeout, requests can block forever on a stalled connection
+    response = requests.get(_url_github_api(user_repo), timeout=30)
+    if response.status_code != 200:
+        return None
+    # ``.get``: a payload lacking ``html_url`` yields None instead of a KeyError
+    return response.json().get("html_url")
diff --git a/swh/core/tests/test_github_utils.py b/swh/core/tests/test_github_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/core/tests/test_github_utils.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.core.github.utils import (
+ _sanitize_github_url,
+ _url_github_api,
+ _url_github_html,
+ get_canonical_github_origin_url,
+)
+
+KNOWN_GH_REPO = "https://github.com/user/repo"  # expected canonical url in the cases below
+
+
+@pytest.mark.parametrize(
+ "user_repo, expected_url",
+ [
+ ("user/repo.git", KNOWN_GH_REPO),
+ ("user/repo.git/", KNOWN_GH_REPO),
+ ("user/repo/", KNOWN_GH_REPO),
+ ("user/repo", KNOWN_GH_REPO),
+ ("user/repo/.git", KNOWN_GH_REPO),
+ # edge cases: the mocked api answers 404 for these, so None is expected
+ ("https://github.com/unknown-page", None), # unknown gh origin returns None
+ ("user/repo/with/some/deps", None), # paths deeper than user/repo are not dealt with for now
+ ],
+)
+def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock):
+ """It should return the canonical github origin on a 200 api answer, None otherwise"""
+ html_url = _url_github_html(user_repo)
+ api_url = _url_github_api(_sanitize_github_url(user_repo))
+
+ if expected_url is not None:
+ status_code = 200
+ response = {"html_url": _sanitize_github_url(html_url)}
+ else:
+ status_code = 404
+ response = {}
+
+ requests_mock.get(api_url, [{"status_code": status_code, "json": response}])
+
+ assert get_canonical_github_origin_url(html_url) == expected_url
+
+
+def test_get_canonical_github_origin_url_not_gh_origin():
+ """It should return the input url unchanged when it is not a github url"""
+ url = "https://example.org"
+ assert get_canonical_github_origin_url(url) == url

File Metadata

Mime Type
text/plain
Expires
Mon, Nov 18, 10:42 PM (21 h, 14 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226777

Event Timeline