Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7085396
D7836.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
3 KB
Subscribers
None
D7836.diff
View Options
diff --git a/swh/core/github/__init__.py b/swh/core/github/__init__.py
new file mode 100644
diff --git a/swh/core/github/utils.py b/swh/core/github/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/core/github/utils.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import re
+from typing import Optional
+
+import requests
+
+GITHUB_PATTERN = re.compile(r"https?://github\.com/(?P<user_repo>.*)")  # literal dot
+
+
+def _url_github_html(user_repo: str) -> str:
+    """Build the github web (html) url of the ``user_repo`` path."""
+    return "https://github.com/{}".format(user_repo)
+
+
+def _url_github_api(user_repo: str) -> str:
+    """Build the github api url of the ``user_repo`` repository path."""
+    return "https://api.github.com/repos/{}".format(user_repo)
+
+
+def _sanitize_github_url(url: str) -> str:
+    """Lowercase ``url`` and drop trailing slashes and one trailing ``.git`` suffix."""
+    return re.sub(r"\.git\Z", "", url.lower().rstrip("/")).rstrip("/")
+
+
+def get_canonical_github_origin_url(url: str) -> Optional[str]:
+    """Retrieve the canonical github url of ``url``.
+
+    Urls not matching GITHUB_PATTERN are returned unchanged. Otherwise an anonymous
+    http request is sent to the github api url of the repository: its "html_url" is
+    returned on a 200 response, None on any other status code.
+    """
+    match = GITHUB_PATTERN.match(url.lower())
+    if not match:
+        # not a github url: nothing to canonicalize
+        return url
+
+    user_repo = _sanitize_github_url(match.group("user_repo"))
+    # bounded wait: without a timeout, requests can block forever on a stalled server
+    response = requests.get(_url_github_api(user_repo), timeout=30)
+    if response.status_code != 200:
+        return None
+    data = response.json()
+    return data["html_url"]
diff --git a/swh/core/tests/test_github_utils.py b/swh/core/tests/test_github_utils.py
new file mode 100644
--- /dev/null
+++ b/swh/core/tests/test_github_utils.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from swh.core.github.utils import (
+ _sanitize_github_url,
+ _url_github_api,
+ _url_github_html,
+ get_canonical_github_origin_url,
+)
+
+KNOWN_GH_REPO = "https://github.com/user/repo"  # expected canonical url
+
+
+@pytest.mark.parametrize(
+    "user_repo, expected_url",
+    [
+        ("user/repo.git", KNOWN_GH_REPO),
+        ("user/repo.git/", KNOWN_GH_REPO),
+        ("user/repo/", KNOWN_GH_REPO),
+        ("user/repo", KNOWN_GH_REPO),
+        ("user/repo/.git", KNOWN_GH_REPO),
+        # edge cases
+        ("https://github.com/unknown-page", None),  # unknown gh origin returns None
+        ("user/repo/with/some/deps", None),  # url kind is not dealt with for now
+    ],
+)
+def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock):
+    """Check the url returned for a mocked github api answer (200 or 404)"""
+    origin_url = _url_github_html(user_repo)
+    mocked_api_url = _url_github_api(_sanitize_github_url(user_repo))
+
+    if expected_url is None:
+        # no canonical url expected: the mocked api answers 404, empty payload
+        reply = {"status_code": 404, "json": {}}
+    else:
+        # the mocked api answers 200 with the sanitized origin url as html_url
+        payload = {"html_url": _sanitize_github_url(origin_url)}
+        reply = {"status_code": 200, "json": payload}
+    requests_mock.get(mocked_api_url, [reply])
+
+    assert get_canonical_github_origin_url(origin_url) == expected_url
+
+
+def test_get_canonical_github_origin_url_not_gh_origin():
+    """A url which is not a github one should be returned as is"""
+    non_github_url = "https://example.org"
+    assert get_canonical_github_origin_url(non_github_url) == non_github_url
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Nov 18, 10:42 PM (21 h, 14 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226777
Attached To
D7836: Add utility to sanitize and retrieve canonical github urls
Event Timeline
Log In to Comment