Skip to content
Snippets Groups Projects
Verified Commit 60f384d3 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

Add utility function to retrieve canonical github urls

This new code lives in a new directory tree because more code will be moved alongside
it in a later commit (the GitHub session currently in the swh.lister module).

For now, this code makes anonymous requests to the GitHub API.

Related to T4232
parent dae96344
No related branches found
No related tags found
No related merge requests found
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
from typing import Optional
import requests
# Matches http(s) urls hosted on github.com; everything after the host is captured in
# the "user_repo" group. The dot is escaped so only the literal "github.com" host
# matches (an unescaped "." would also accept e.g. "githubxcom").
GITHUB_PATTERN = re.compile(r"https?://github\.com/(?P<user_repo>.*)")
def _url_github_html(user_repo: str) -> str:
    """Build the github html (web) url for the given ``user/repo`` path."""
    html_url = f"https://github.com/{user_repo}"
    return html_url
def _url_github_api(user_repo: str) -> str:
    """Build the github api url describing the given ``user/repo`` repository."""
    api_url = f"https://api.github.com/repos/{user_repo}"
    return api_url
def _sanitize_github_url(url: str) -> str:
    """Sanitize a github url (or ``user/repo`` path).

    The value is lowercased, trailing slashes are dropped, one trailing ``.git``
    suffix is removed, and any slashes exposed by that removal are dropped too.

    Args:
        url: github url or ``user/repo`` path to sanitize

    Returns:
        The sanitized, lowercased value.

    """
    sanitized = url.lower().rstrip("/")
    # str.rstrip(".git") strips any trailing run of the characters ".", "g", "i"
    # and "t" (turning "user/digit" into "user/d"), so remove the exact suffix instead.
    if sanitized.endswith(".git"):
        sanitized = sanitized[: -len(".git")]
    return sanitized.rstrip("/")
def get_canonical_github_origin_url(url: str) -> Optional[str]:
    """Retrieve the canonical github url of a repository.

    This triggers an anonymous http request to the github api url to determine the
    canonical repository url.

    Args:
        url: origin url to canonicalize

    Returns:
        - the input url, unchanged, when it does not match the github url pattern,
        - the ``html_url`` value of the github api response when the api answers
          with a 200 status code,
        - None when the api answers with any other status code.

    Raises:
        Exceptions raised by ``requests`` (connection failure, timeout, ...) are not
        caught here and propagate to the caller.

    """
    match = GITHUB_PATTERN.match(url.lower())
    if match is None:
        # Not a github origin: hand the caller's url back untouched.
        return url

    user_repo = _sanitize_github_url(match.group("user_repo"))
    # Always bound the request: without a timeout, requests may wait indefinitely
    # on an unresponsive server.
    response = requests.get(_url_github_api(user_repo), timeout=30)
    if response.status_code != 200:
        return None
    return response.json()["html_url"]
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.core.github.utils import (
_sanitize_github_url,
_url_github_api,
_url_github_html,
get_canonical_github_origin_url,
)
# Canonical html url expected for the parametrized cases below that resolve successfully.
KNOWN_GH_REPO = "https://github.com/user/repo"
@pytest.mark.parametrize(
    "user_repo, expected_url",
    [
        ("user/repo.git", KNOWN_GH_REPO),
        ("user/repo.git/", KNOWN_GH_REPO),
        ("user/repo/", KNOWN_GH_REPO),
        ("user/repo", KNOWN_GH_REPO),
        ("user/repo/.git", KNOWN_GH_REPO),
        # edge cases
        ("https://github.com/unknown-page", None),  # origin unknown to github: None
        ("user/repo/with/some/deps", None),  # nested paths are not handled for now
    ],
)
def test_get_canonical_github_origin_url(user_repo, expected_url, requests_mock):
    """The canonical github origin is returned when the api knows it, else None."""
    origin_url = _url_github_html(user_repo)
    mocked_api_url = _url_github_api(_sanitize_github_url(user_repo))

    # Mock the api answer: a 404 with an empty payload when no canonical url is
    # expected, a 200 carrying the sanitized html url otherwise.
    if expected_url is None:
        api_answer = {"status_code": 404, "json": {}}
    else:
        api_answer = {
            "status_code": 200,
            "json": {"html_url": _sanitize_github_url(origin_url)},
        }
    requests_mock.get(mocked_api_url, [api_answer])

    assert get_canonical_github_origin_url(origin_url) == expected_url
def test_get_canonical_github_origin_url_not_gh_origin():
    """An origin url that is not a github one must be handed back unchanged."""
    non_github_url = "https://example.org"
    canonical_url = get_canonical_github_origin_url(non_github_url)
    assert canonical_url == non_github_url
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment