Skip to content
Snippets Groups Projects
Commit 3cc28eb7 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

github: Add get_repository_metadata method to GitHubSession class

It returns the full JSON metadata of a GitHub repository by querying the
GitHub REST API.

Also refactor get_canonical_url method with it.

Related to swh/devel/swh-loader-git#3652.
parent 77a91293
No related branches found
No related tags found
1 merge request: !349 "github: Add get_repository_metadata method to GitHubSession class"
Pipeline #2104 passed
# Copyright (C) 2023 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -8,6 +8,7 @@ import logging
from unittest.mock import call
import pytest
import requests
from swh.core.github.pytest_plugin import HTTP_GITHUB_API_URL
from swh.core.github.utils import (
......@@ -359,3 +360,21 @@ def test_github_session_ratelimit_reset_sleep_anonymous(
"api_type": "github",
"api_instance": "github",
}
def test_github_session_get_repo_metadata_success(requests_mock):
    """The JSON payload served for a known repository is returned unchanged."""
    expected_metadata = {"html_url": KNOWN_GH_REPO}
    api_url = _url_github_api(KNOWN_GH_REPO.replace("https://github.com/", ""))
    requests_mock.get(api_url, json=expected_metadata)

    session = GitHubSession(user_agent="GitHub Session Test")

    assert session.get_repository_metadata(KNOWN_GH_REPO) == expected_metadata
def test_github_session_get_repo_metadata_failure(requests_mock):
    """A 404 answer from the mocked API endpoint raises requests.HTTPError."""
    api_url = _url_github_api(KNOWN_GH_REPO2.replace("https://github.com/", ""))
    requests_mock.get(api_url, status_code=404)

    session = GitHubSession(user_agent="GitHub Session Test")

    with pytest.raises(requests.HTTPError):
        session.get_repository_metadata(KNOWN_GH_REPO2)
......@@ -8,7 +8,7 @@ import logging
import random
import re
import time
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional
import requests
from tenacity import (
......@@ -259,24 +259,47 @@ class GitHubSession:
self.statsd.increment("sleep_seconds_total", sleep_time)
time.sleep(sleep_time)
def get_repository_metadata(self, repo_url: str) -> Optional[Dict[str, Any]]:
    """Retrieve metadata of a repository from the github API.

    This triggers an http request to the github api url built from the
    ``user_repo`` part of the lowercased ``repo_url``.

    Args:
        repo_url: URL of a github repository

    Returns:
        A dictionary holding the metadata of the repository (the decoded
        JSON body of the API response) or None if ``repo_url`` does not
        match ``GITHUB_PATTERN``.

    Raises:
        requests.HTTPError: if the request to the github API failed.
    """
    # Matching is done on the lowercased URL; the caller's string is untouched.
    match = GITHUB_PATTERN.match(repo_url.lower())
    if not match:
        return None

    user_repo = _sanitize_github_url(match.groupdict()["user_repo"])
    response = self.request(_url_github_api(user_repo))
    if response.status_code != 200:
        # NOTE(review): raise_for_status() only raises for 4xx/5xx answers,
        # so any other non-200 response still reaches json() below.
        response.raise_for_status()
    return response.json()
def get_canonical_url(self, repo_url: str) -> Optional[str]:
    """Retrieve canonical github url out of a github url.

    This delegates to ``get_repository_metadata`` and reads the ``html_url``
    entry of the returned metadata.

    Args:
        repo_url: URL of a github repository

    Returns:
        The canonical github url, the input url if it is not a github one
        (no metadata could be looked up for it), None otherwise.
    """
    # Keep the try body down to the single call that can raise.
    try:
        metadata = self.get_repository_metadata(repo_url)
    except requests.HTTPError:
        # invalid github repository
        return None

    if metadata is None:
        # Not a github url: hand the input back untouched.
        return repo_url

    # An explicit None check (rather than truthiness) keeps an empty metadata
    # dict from being mistaken for "not a github url".
    return metadata.get("html_url")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment