From 323e2774829842f6dba89bc3f5656642878aab52 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Wed, 5 Jun 2024 15:14:07 +0200
Subject: [PATCH] gitea, gogs: Ensure query parameters are not duplicated in
 API URLs

Gitea API return next pagination link with all query parameters provided
to an API request.

As we were also passing a dict of fixed query parameters to the page_request
method, some query parameters ended up having multiple instances in the URL
for fetching a new page of repositories data. So each time a new page was
requested, new instances of these parameters were appended to the URL which
could result in a really long URL if the number of pages to retrieve is high
and make the request fail.

Also remove a debug log already present in http_request method.
---
 swh/lister/gogs/lister.py            | 14 ++++++++------
 swh/lister/gogs/tests/test_lister.py | 10 +++++++++-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py
index 5256f4d6..670b50ae 100644
--- a/swh/lister/gogs/lister.py
+++ b/swh/lister/gogs/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023  The Software Heritage developers
+# Copyright (C) 2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -45,7 +45,6 @@ def _parse_page_id(url: Optional[str]) -> int:
 
 
 class GogsLister(Lister[GogsListerState, GogsListerPage]):
-
     """List origins from the Gogs
 
     Gogs API documentation: https://github.com/gogs/docs-api
@@ -125,10 +124,8 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
         return asdict(state)
 
     def page_request(
-        self, url: str, params: Dict[str, Any]
+        self, url: str, params: Optional[Dict[str, Any]] = None
     ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        logger.debug("Fetching URL %s with params %s", url, params)
-
         try:
             response = self.http_request(url, params=params)
         except HTTPError as http_error:
@@ -177,7 +174,12 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
             yield GogsListerPage(repos=repos, next_link=next_link)
 
             if next_link is not None:
-                body, links = self.page_request(next_link, self.query_params)
+                parsed_url = urlparse(next_link)
+                query_params = {**self.query_params, **parse_qs(parsed_url.query)}
+                next_link = parsed_url._replace(
+                    query=urlencode(query_params, doseq=True)
+                ).geturl()
+                body, links = self.page_request(next_link)
 
     def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]:
         """Convert a page of Gogs repositories into a list of ListedOrigins"""
diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py
index c219ea77..b586be5a 100644
--- a/swh/lister/gogs/tests/test_lister.py
+++ b/swh/lister/gogs/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2023  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,6 +7,7 @@ import json
 from pathlib import Path
 from typing import List
 from unittest.mock import Mock
+from urllib.parse import parse_qs, urlparse
 
 import pytest
 from requests import HTTPError
@@ -145,6 +146,13 @@ def test_gogs_full_listing(
         lister.get_state_from_scheduler().last_seen_next_link == P3
     )  # P3 didn't provide any next link so it remains the last_seen_next_link
 
+    # check each query parameter has a single instance in Gogs API URLs
+    for request in requests_mock.request_history:
+        assert all(
+            len(values) == 1
+            for values in parse_qs(urlparse(request.url).query).values()
+        )
+
 
 def test_gogs_auth_instance(
     swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_p3_empty
-- 
GitLab