Skip to content
Snippets Groups Projects
Verified Commit 92d49426 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

lister: Make sure lister that requires github tokens can use it

Deploying the nixguix lister, I realized that even though the credentials configuration
is properly set for all listers, the listers actually requiring github origin
canonicalization do not have access to the github credentials. It's lost during the
constructor to only focus on the lister's credentials. Which currently translates to
listers being rate-limited.

This commit fixes it by pushing the self.github_session instantiation in the constructor
when the lister explicitely requires the github session. Hence lifting the rate limit
for maven, packagist, nixguix, and github listers.

Related to infra/sysadm-environment#4655
parent 81688ca1
No related branches found
No related tags found
No related merge requests found
......@@ -11,7 +11,7 @@ from urllib.parse import parse_qs, urlparse
import iso8601
from swh.core.github.utils import GitHubSession, MissingRateLimitReset
from swh.core.github.utils import MissingRateLimitReset
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -78,6 +78,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
credentials=credentials,
url=self.API_URL,
instance="github",
with_github_session=True,
)
self.first_id = first_id
......@@ -85,11 +86,6 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
self.relisting = self.first_id is not None or self.last_id is not None
self.github_session = GitHubSession(
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
return GitHubListerState(**d)
......@@ -109,6 +105,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
logger.debug("Getting page %s", current_url)
try:
assert self.github_session is not None
response = self.github_session.request(current_url)
except MissingRateLimitReset:
# Give up
......
......@@ -141,7 +141,7 @@ def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> Non
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
lister = GitHubLister(scheduler=swh_scheduler)
assert lister.github_session.anonymous
assert lister.github_session is not None and lister.github_session.anonymous
assert "using anonymous mode" in caplog.records[-1].message
caplog.clear()
......
......@@ -14,7 +14,6 @@ from bs4 import BeautifulSoup
import lxml
import requests
from swh.core.github.utils import GitHubSession
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -88,15 +87,12 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
credentials=credentials,
url=url,
instance=instance,
with_github_session=True,
)
self.session.headers.update({"Accept": "application/json"})
self.jar_origins: Dict[str, ListedOrigin] = {}
self.github_session = GitHubSession(
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
return MavenListerState(**d)
......@@ -294,6 +290,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
return None
if url and visit_type == "git":
assert self.github_session is not None
# Non-github urls will be returned as is, github ones will be canonical ones
url = self.github_session.get_canonical_url(url)
......
......@@ -29,7 +29,6 @@ from urllib.parse import parse_qsl, urlparse
import requests
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
from swh.core.github.utils import GitHubSession
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.pattern import CredentialsType, StatelessLister
......@@ -331,6 +330,7 @@ class NixGuixLister(StatelessLister[PageResult]):
url=url.rstrip("/"),
instance=instance,
credentials=credentials,
with_github_session=canonicalize,
)
# either full fqdn NixOS/nixpkgs or guix repository urls
# maybe add an assert on those specific urls?
......@@ -338,16 +338,6 @@ class NixGuixLister(StatelessLister[PageResult]):
self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
self.session = requests.Session()
# for testing purposes, we may want to skip this step (e.g. docker run and rate
# limit)
self.github_session = (
GitHubSession(
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
if canonicalize
else None
)
def build_artifact(
self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None
......
......@@ -11,7 +11,6 @@ from typing import Any, Dict, Iterator, List, Optional
import iso8601
import requests
from swh.core.github.utils import GitHubSession
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
......@@ -60,14 +59,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
url=self.PACKAGIST_PACKAGES_LIST_URL,
instance="packagist",
credentials=credentials,
with_github_session=True,
)
self.session.headers.update({"Accept": "application/json"})
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
self.github_session = GitHubSession(
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
last_listing_date = d.get("last_listing_date")
......@@ -152,6 +148,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
if visit_type == "git":
# Non-github urls will be returned as is, github ones will be canonical
# ones
assert self.github_session is not None
origin_url = (
self.github_session.get_canonical_url(origin_url) or origin_url
)
......
......@@ -14,6 +14,7 @@ import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.config import load_from_envvar
from swh.core.github.utils import GitHubSession
from swh.core.utils import grouper
from swh.scheduler import get_scheduler, model
from swh.scheduler.interface import SchedulerInterface
......@@ -93,6 +94,7 @@ class Lister(Generic[StateType, PageType]):
"""
LISTER_NAME: str = ""
github_session: Optional[GitHubSession] = None
def __init__(
self,
......@@ -100,6 +102,7 @@ class Lister(Generic[StateType, PageType]):
url: str,
instance: Optional[str] = None,
credentials: CredentialsType = None,
with_github_session: bool = False,
):
if not self.LISTER_NAME:
raise ValueError("Must set the LISTER_NAME attribute on Lister classes")
......@@ -127,6 +130,14 @@ class Lister(Generic[StateType, PageType]):
self.session.headers.update(
{"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME}
)
self.github_session: Optional[GitHubSession] = (
GitHubSession(
credentials=credentials.get("github", {}).get("github", []),
user_agent=str(self.session.headers["User-Agent"]),
)
if with_github_session
else None
)
self.recorded_origins: Set[str] = set()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment