Skip to content
Snippets Groups Projects
Commit 17a219ec authored by vlorentz
Browse files

gitea: Inherit from Gogs lister

This removes code and adds support for incremental pagination.

While both are essentially the same lister now, it still makes sense to
keep the Gitea lister separate, in order to:

1. display them in different categories on https://archive.softwareheritage.org/
2. support possible divergence of APIs in the future
parent dde7865a
No related branches found
No related tags found
No related merge requests found
......@@ -4,27 +4,13 @@
# See top-level LICENSE file for more information
import logging
import random
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
from ..gogs.lister import GogsLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Type alias for one page of listing results: a list of repository dicts.
RepoListPage = List[Dict[str, Any]]
class GiteaLister(StatelessLister[RepoListPage]):
class GiteaLister(GogsLister):
"""List origins from Gitea.
Gitea API documentation: https://try.gitea.io/api/swagger
......@@ -35,108 +21,7 @@ class GiteaLister(StatelessLister[RepoListPage]):
LISTER_NAME = "gitea"
REPO_LIST_PATH = "repos/search"
def __init__(
self,
scheduler: SchedulerInterface,
url: str,
instance: Optional[str] = None,
api_token: Optional[str] = None,
page_size: int = 50,
credentials: CredentialsType = None,
):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=url,
instance=instance,
)
self.query_params = {
"sort": "id",
"order": "asc",
"limit": page_size,
"page": 1,
}
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
def on_anonymous_mode(self):
    """Report that no authentication token is configured.

    Only emits a warning through the module logger and returns; nothing is
    raised here.
    """
    message = "No authentication token set in configuration, using anonymous mode"
    logger.warning(message)
if api_token is None:
if len(self.credentials) > 0:
cred = random.choice(self.credentials)
username = cred.get("username")
api_token = cred["password"]
logger.warning(
"Using authentication token from user %s", username or "???"
)
else:
logger.warning(
"No authentication token set in configuration, using anonymous mode"
)
if api_token:
self.session.headers["Authorization"] = "Token %s" % api_token
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
    """Perform a GET on ``url`` with ``params`` using the lister's session.

    Any status code other than 200 is logged as a warning (status, final
    URL and raw body) before ``raise_for_status()`` is invoked on the
    response.

    Args:
        url: address to fetch.
        params: query parameters passed to the session's ``get``.

    Returns:
        The response object, when ``raise_for_status()`` did not raise.
    """
    logger.info("Fetching URL %s with params %s", url, params)

    reply = self.session.get(url, params=params)
    is_ok = reply.status_code == 200
    if not is_ok:
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            reply.status_code,
            reply.url,
            reply.content,
        )

    # Called unconditionally, exactly like for successful responses.
    reply.raise_for_status()
    return reply
@classmethod
def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage:
    """Reduce each repository of ``body["data"]`` to the fields the lister uses.

    Args:
        body: decoded API response; its ``"data"`` entry holds the repositories.

    Returns:
        One dict per repository with only the ``id``, ``clone_url`` and
        ``updated_at`` keys, in the same order as the input.
    """
    kept_keys = ["id", "clone_url", "updated_at"]
    simplified = []
    for repo in body["data"]:
        simplified.append({key: repo[key] for key in kept_keys})
    return simplified
def get_pages(self) -> Iterator[RepoListPage]:
    """Yield simplified pages of repositories, following ``next`` links.

    The first request goes to ``REPO_LIST_PATH`` joined onto ``self.url``
    with ``self.query_params``; every later request uses the URL found in
    the response's ``next`` link, with empty parameters. Iteration stops
    once a response has no ``next`` link.
    """
    # base with trailing slash, path without leading slash for urljoin
    target: str = urljoin(self.url, self.REPO_LIST_PATH)
    params = self.query_params

    while True:
        response = self.page_request(target, params)
        yield self.results_simplified(response.json())

        assert len(response.links) > 0, "API changed: no Link header found"
        if "next" not in response.links:
            # last page
            break

        # The "next" URL is requested as-is, with no extra parameters.
        target = response.links["next"]["url"]
        params = {}
def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]:
    """Turn each repository of ``page`` into a git ``ListedOrigin``.

    The origin URL comes from ``clone_url`` and the last update time is
    parsed from the ``updated_at`` field with ``iso8601``.
    """
    assert self.lister_obj.id is not None

    for entry in page:
        updated = iso8601.parse_date(entry["updated_at"])
        origin = ListedOrigin(
            lister_id=self.lister_obj.id,
            url=entry["clone_url"],
            visit_type="git",
            last_update=updated,
        )
        yield origin
......@@ -10,33 +10,40 @@ from typing import Dict, List, Tuple
import pytest
import requests
from swh.lister.gitea.lister import GiteaLister, RepoListPage
from swh.lister.gitea.lister import GiteaLister
from swh.lister.gogs.lister import GogsListerPage
from swh.scheduler.model import ListedOrigin
# Base URL that the page URLs below are built from.
TRYGITEA_URL = "https://try.gitea.io/api/v1/"
# NOTE(review): each page URL is assigned twice; the later assignment
# (without sort/order parameters) is the one that takes effect. This looks
# like the old and new sides of a diff flattened together - confirm.
TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=1"
TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=2"
TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?limit=3&page=1"
TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?limit=3&page=2"
@pytest.fixture
def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]:
def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]:
text = Path(datadir, "https_try.gitea.io", "repos_page1").read_text()
headers = {
"Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=TRYGITEA_P2_URL)
}
page_result = GiteaLister.results_simplified(json.loads(text))
origin_urls = [r["clone_url"] for r in page_result]
page_data = json.loads(text)
page_result = GogsListerPage(
repos=GiteaLister.extract_repos(page_data), next_link=TRYGITEA_P2_URL
)
origin_urls = [r["clone_url"] for r in page_data["data"]]
return text, headers, page_result, origin_urls
@pytest.fixture
def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]:
def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]:
text = Path(datadir, "https_try.gitea.io", "repos_page2").read_text()
headers = {
"Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=TRYGITEA_P1_URL)
}
page_result = GiteaLister.results_simplified(json.loads(text))
origin_urls = [r["clone_url"] for r in page_result]
page_data = json.loads(text)
page_result = GogsListerPage(
repos=GiteaLister.extract_repos(page_data), next_link=None
)
origin_urls = [r["clone_url"] for r in page_data["data"]]
return text, headers, page_result, origin_urls
......@@ -93,7 +100,9 @@ def test_gitea_full_listing(
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
assert lister.get_state_from_scheduler() is None
lister_state = lister.get_state_from_scheduler()
assert lister_state.last_seen_next_link == TRYGITEA_P2_URL
assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"]
def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1):
......
......@@ -96,7 +96,8 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
"Using authentication credentials from user %s", username or "???"
)
else:
raise ValueError("No credentials or API token provided")
# Raises an error on Gogs, or a warning on Gitea
self.on_anonymous_mode()
self.max_page_limit = 2
......@@ -105,10 +106,15 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
"Authorization": f"token {self.api_token}",
}
)
if self.api_token:
self.session.headers["Authorization"] = f"token {self.api_token}"
def on_anonymous_mode(self):
    """Reject running without credentials or an API token.

    Raises:
        ValueError: always.
    """
    reason = "No credentials or API token provided"
    raise ValueError(reason)
def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState:
    """Build a ``GogsListerState`` using the entries of ``d`` as keyword arguments."""
    state = GogsListerState(**d)
    return state
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment